1 /* 2 ** 2001 September 15 3 ** 4 ** The author disclaims copyright to this source code. In place of 5 ** a legal notice, here is a blessing: 6 ** 7 ** May you do good and not evil. 8 ** May you find forgiveness for yourself and forgive others. 9 ** May you share freely, never taking more than you give. 10 ** 11 ************************************************************************* 12 ** A tokenizer for SQL 13 ** 14 ** This file contains C code that splits an SQL input string up into 15 ** individual tokens and sends those tokens one-by-one over to the 16 ** parser for analysis. 17 */ 18 19 #include <ctype.h> 20 #include <stdarg.h> 21 #include <stdlib.h> 22 23 #include "windef.h" 24 #include "winbase.h" 25 #include "query.h" 26 #include "sql.tab.h" 27 28 /* 29 ** All the keywords of the SQL language are stored as in a hash 30 ** table composed of instances of the following structure. 31 */ 32 struct keyword 33 { 34 const WCHAR *name; /* The keyword name */ 35 unsigned int len; 36 int tokenType; /* The token value for this keyword */ 37 }; 38 39 #define MAX_TOKEN_LEN 11 40 41 /* 42 ** These are the keywords 43 ** They MUST be in alphabetical order 44 */ 45 #define X(str) str, ARRAY_SIZE(str) - 1 46 static const struct keyword aKeywordTable[] = { 47 { X(L"ADD"), TK_ADD }, 48 { X(L"ALTER"), TK_ALTER }, 49 { X(L"AND"), TK_AND }, 50 { X(L"BY"), TK_BY }, 51 { X(L"CHAR"), TK_CHAR }, 52 { X(L"CHARACTER"), TK_CHAR }, 53 { X(L"CREATE"), TK_CREATE }, 54 { X(L"DELETE"), TK_DELETE }, 55 { X(L"DISTINCT"), TK_DISTINCT }, 56 { X(L"DROP"), TK_DROP }, 57 { X(L"FREE"), TK_FREE }, 58 { X(L"FROM"), TK_FROM }, 59 { X(L"HOLD"), TK_HOLD }, 60 { X(L"INSERT"), TK_INSERT }, 61 { X(L"INT"), TK_INT }, 62 { X(L"INTEGER"), TK_INT }, 63 { X(L"INTO"), TK_INTO }, 64 { X(L"IS"), TK_IS }, 65 { X(L"KEY"), TK_KEY }, 66 { X(L"LIKE"), TK_LIKE }, 67 { X(L"LOCALIZABLE"), TK_LOCALIZABLE }, 68 { X(L"LONG"), TK_LONG }, 69 { X(L"LONGCHAR"), TK_LONGCHAR }, 70 { X(L"NOT"), TK_NOT }, 71 { X(L"NULL"), TK_NULL }, 72 { X(L"OBJECT"), TK_OBJECT }, 73 { X(L"OR"), TK_OR }, 74 { X(L"ORDER"), TK_ORDER }, 75 { X(L"PRIMARY"), TK_PRIMARY }, 76 { X(L"SELECT"), TK_SELECT }, 77 { X(L"SET"), TK_SET }, 78 { X(L"SHORT"), TK_SHORT }, 79 { X(L"TABLE"), TK_TABLE }, 80 { X(L"TEMPORARY"), TK_TEMPORARY }, 81 { X(L"UPDATE"), TK_UPDATE }, 82 { X(L"VALUES"), TK_VALUES }, 83 { X(L"WHERE"), TK_WHERE }, 84 }; 85 #undef X 86 87 /* 88 ** Comparison function for binary search. 89 */ 90 static int __cdecl compKeyword(const void *m1, const void *m2){ 91 const struct keyword *k1 = m1, *k2 = m2; 92 int ret, len = min( k1->len, k2->len ); 93 94 if ((ret = wcsnicmp( k1->name, k2->name, len ))) return ret; 95 if (k1->len < k2->len) return -1; 96 else if (k1->len > k2->len) return 1; 97 return 0; 98 } 99 100 /* 101 ** This function looks up an identifier to determine if it is a 102 ** keyword. If it is a keyword, the token code of that keyword is 103 ** returned. If the input is not a keyword, TK_ID is returned. 104 */ 105 static int sqliteKeywordCode(const WCHAR *z, int n){ 106 struct keyword key, *r; 107 108 if( n>MAX_TOKEN_LEN ) 109 return TK_ID; 110 111 key.tokenType = 0; 112 key.name = z; 113 key.len = n; 114 r = bsearch( &key, aKeywordTable, ARRAY_SIZE(aKeywordTable), sizeof(struct keyword), compKeyword ); 115 if( r ) 116 return r->tokenType; 117 return TK_ID; 118 } 119 120 121 /* 122 ** If X is a character that can be used in an identifier then 123 ** isIdChar[X] will be 1. Otherwise isIdChar[X] will be 0. 124 ** 125 ** In this implementation, an identifier can be a string of 126 ** alphabetic characters, digits, and "_" plus any character 127 ** with the high-order bit set. The latter rule means that 128 ** any sequence of UTF-8 characters or characters taken from 129 ** an extended ISO8859 character set can form an identifier. 130 */ 131 static const char isIdChar[] = { 132 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */ 133 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x */ 134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1x */ 135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, /* 2x */ 136 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */ 137 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */ 138 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */ 139 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */ 140 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */ 141 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 8x */ 142 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 9x */ 143 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Ax */ 144 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Bx */ 145 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Cx */ 146 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Dx */ 147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Ex */ 148 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* Fx */ 149 }; 150 151 /* 152 ** WCHAR safe version of isdigit() 153 */ 154 static inline int isDigit(WCHAR c) 155 { 156 return c >= '0' && c <= '9'; 157 } 158 159 /* 160 ** WCHAR safe version of isspace(), except '\r' 161 */ 162 static inline int isSpace(WCHAR c) 163 { 164 return c == ' ' || c == '\t' || c == '\n' || c == '\f'; 165 } 166 167 /* 168 ** Return the length of the token that begins at z[0]. Return 169 ** -1 if the token is (or might be) incomplete. Store the token 170 ** type in *tokenType before returning. 171 */ 172 int sqliteGetToken(const WCHAR *z, int *tokenType, int *skip){ 173 int i; 174 175 *skip = 0; 176 switch( *z ){ 177 case ' ': case '\t': case '\n': case '\f': 178 for(i=1; isSpace(z[i]); i++){} 179 *tokenType = TK_SPACE; 180 return i; 181 case '-': 182 if( z[1]==0 ) return -1; 183 *tokenType = TK_MINUS; 184 return 1; 185 case '(': 186 *tokenType = TK_LP; 187 return 1; 188 case ')': 189 *tokenType = TK_RP; 190 return 1; 191 case '*': 192 *tokenType = TK_STAR; 193 return 1; 194 case '=': 195 *tokenType = TK_EQ; 196 return 1; 197 case '<': 198 if( z[1]=='=' ){ 199 *tokenType = TK_LE; 200 return 2; 201 }else if( z[1]=='>' ){ 202 *tokenType = TK_NE; 203 return 2; 204 }else{ 205 *tokenType = TK_LT; 206 return 1; 207 } 208 case '>': 209 if( z[1]=='=' ){ 210 *tokenType = TK_GE; 211 return 2; 212 }else{ 213 *tokenType = TK_GT; 214 return 1; 215 } 216 case '!': 217 if( z[1]!='=' ){ 218 *tokenType = TK_ILLEGAL; 219 return 2; 220 }else{ 221 *tokenType = TK_NE; 222 return 2; 223 } 224 case '?': 225 *tokenType = TK_WILDCARD; 226 return 1; 227 case ',': 228 *tokenType = TK_COMMA; 229 return 1; 230 case '`': case '\'': { 231 int delim = z[0]; 232 for(i=1; z[i]; i++){ 233 if( z[i]==delim ) 234 break; 235 } 236 if( z[i] ) i++; 237 if( delim == '`' ) 238 *tokenType = TK_ID; 239 else 240 *tokenType = TK_STRING; 241 return i; 242 } 243 case '.': 244 if( !isDigit(z[1]) ){ 245 *tokenType = TK_DOT; 246 return 1; 247 } 248 /* Fall through */ 249 case '0': case '1': case '2': case '3': case '4': 250 case '5': case '6': case '7': case '8': case '9': 251 *tokenType = TK_INTEGER; 252 for(i=1; isDigit(z[i]); i++){} 253 return i; 254 case '[': 255 for(i=1; z[i] && z[i-1]!=']'; i++){} 256 *tokenType = TK_ID; 257 return i; 258 default: 259 if( !isIdChar[*z] ){ 260 break; 261 } 262 for(i=1; isIdChar[z[i]]; i++){} 263 *tokenType = sqliteKeywordCode(z, i); 264 if( *tokenType == TK_ID && z[i] == '`' ) *skip = 1; 265 return i; 266 } 267 *tokenType = TK_ILLEGAL; 268 return 1; 269 } 270