1 /*
2 ** 2012 May 25
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13 
14 /*
15 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
16 */
17 
18 #ifndef SQLITE_DISABLE_FTS3_UNICODE
19 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
20 
21 #include <assert.h>
22 
23 /*
24 ** Return true if the argument corresponds to a unicode codepoint
25 ** classified as either a letter or a number. Otherwise false.
26 **
27 ** The results are undefined if the value passed to this function
28 ** is less than zero.
29 */
int sqlite3FtsUnicodeIsalnum(int c){
  /* Sorted table of codepoint ranges that are NOT letters or numbers, i.e.
  ** ranges for which this function returns 0.
  **
  ** Each 32-bit element packs one range: the upper 22 bits hold the first
  ** codepoint of the range and the lower 10 bits hold the number of
  ** codepoints in it (never zero). So the value ((C<<10) + N) describes N
  ** consecutive codepoints beginning at C, and no single element can
  ** describe more than 1023 codepoints.
  */
  static const unsigned int aRange[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* One bit per ASCII codepoint (32 codepoints per word). A set bit marks
  ** a codepoint that is neither a letter nor a digit. */
  static const unsigned int aAsciiMask[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  const unsigned int uc = (unsigned int)c;

  /* ASCII: answered directly from the bitmap. */
  if( uc<128 ){
    const unsigned int bit = (unsigned int)1 << (uc & 0x001F);
    return ( (aAsciiMask[uc >> 5] & bit)==0 );
  }

  /* Codepoints that do not fit in the 22-bit field of the table are
  ** always reported as alphanumeric. */
  if( uc>=(1<<22) ){
    return 1;
  }

  {
    /* Search key: the codepoint in the upper bits with all ten size bits
    ** set, so that it compares >= any table element starting at uc. */
    const unsigned int key = (uc<<10) | 0x000003FF;
    int lo = 0;
    int hi = sizeof(aRange)/sizeof(aRange[0]) - 1;
    int idx = 0;
    unsigned int first;
    unsigned int count;

    /* Binary search for the last element that is <= key. */
    for(; lo<=hi; ){
      const int mid = lo + (hi - lo) / 2;
      if( aRange[mid]<=key ){
        idx = mid;
        lo = mid+1;
      }else{
        hi = mid-1;
      }
    }
    assert( aRange[0]<key );
    assert( key>=aRange[idx] );

    /* The codepoint is alphanumeric unless it falls inside the range
    ** [first, first+count) found by the search. */
    first = aRange[idx]>>10;
    count = aRange[idx]&0x3FF;
    return (uc >= (first + count));
  }
}
152 
153 
/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c){
  /* Sorted table of codepoint ranges that map to a single replacement
  ** character. Each element packs one range: the upper 13 bits hold the
  ** first codepoint and the lower 3 bits hold the number of additional
  ** codepoints that follow it, so element E covers the codepoints
  ** (E>>3) through (E>>3)+(E&0x07) inclusive. aChar[i] is the replacement
  ** for every codepoint covered by aDia[i].
  **
  ** Both tables are read-only, so they are declared "static const" to
  ** avoid re-initializing them on the stack on every call.
  */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Search key: the codepoint in the upper bits with all three low bits
  ** set, so that it compares >= any table element that starts at c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last element of aDia[] that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies beyond the end of the range that was found it has no
  ** mapping and is returned unchanged; otherwise return the replacement
  ** character for that range. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}
206 
207 
208 /*
209 ** Return true if the argument interpreted as a unicode codepoint
210 ** is a diacritical modifier character.
211 */
int sqlite3FtsUnicodeIsdiacritic(int c){
  /* Bitmaps covering codepoints 768..817. Bit N of mask0 corresponds to
  ** codepoint 768+N and bit N of mask1 to codepoint 800+N; a set bit
  ** means the function returns non-zero for that codepoint. */
  const unsigned int mask0 = 0x08029FDF;
  const unsigned int mask1 = 0x000361F8;

  /* Everything outside the range covered by the bitmaps returns 0. */
  if( c<768 || c>817 ) return 0;

  /* The shifted operand is unsigned: for c==799 the shift count is 31,
  ** and shifting a signed 1 into the sign bit is undefined behaviour.
  ** The return value is the isolated mask bit (non-zero, not
  ** necessarily 1) when it is set, or 0 when it is clear. */
  return (c < 768+32) ?
      (int)(mask0 & ((unsigned int)1 << (c-768))) :
      (int)(mask1 & ((unsigned int)1 << (c-768-32)));
}
220 
221 
222 /*
223 ** Interpret the argument as a unicode codepoint. If the codepoint
224 ** is an upper case character that has a lower case equivalent,
225 ** return the codepoint corresponding to the lower case version.
226 ** Otherwise, return a copy of the argument.
227 **
228 ** The results are undefined if the value passed to this function
229 ** is less than zero.
230 */
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Case-folding rules, sorted by iCode. Each rule covers the nRange
  ** codepoints beginning at iCode.
  **
  ** When the least significant bit of flags is clear, every codepoint in
  ** the range is folded. When it is set, only every second codepoint is
  ** folded, beginning with iCode itself.
  **
  ** The remaining seven bits of flags (flags>>1) index the aiOff[] array.
  ** A codepoint C that is to be folded maps to
  ** ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The table is generated from the CaseFolding.txt file distributed as
  ** part of the "Unicode Character Database". See http://www.unicode.org
  ** for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };

  /* Offsets added (modulo 65536) to a codepoint to obtain its folded
  ** form; indexed by (flags>>1) of the matching rule. */
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  int folded = c;

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  /* Below 128: only 'A'..'Z' change. Diacritics are never removed here. */
  if( c<128 ){
    if( c>='A' && c<='Z' ){
      folded = c + ('a' - 'A');
    }
    return folded;
  }

  /* Above the BMP: only the 40 codepoints starting at 66560 are folded,
  ** each to the codepoint 40 positions higher. */
  if( c>=65536 ){
    if( c>=66560 && c<66600 ){
      folded = c + 40;
    }
    return folded;
  }

  /* 128..65535: locate the last rule whose first codepoint is <= c. */
  {
    const struct TableEntry *pRule;
    int lo = 0;
    int hi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iRule = -1;

    assert( c>aEntry[0].iCode );
    for(; lo<=hi; ){
      const int mid = lo + (hi - lo) / 2;
      if( c>=(int)aEntry[mid].iCode ){
        iRule = mid;
        lo = mid+1;
      }else{
        hi = mid-1;
      }
    }

    assert( iRule>=0 && c>=aEntry[iRule].iCode );
    pRule = &aEntry[iRule];

    /* Apply the rule if c lies inside its range and, for an
    ** "every second codepoint" rule, c has the same parity as iCode. */
    if( c<(pRule->iCode + pRule->nRange) ){
      const int bAlternate = (pRule->flags & 0x01)!=0;
      const int bOddOffset = ((pRule->iCode ^ c) & 0x01)!=0;
      if( !(bAlternate && bOddOffset) ){
        folded = (c + (aiOff[pRule->flags>>1])) & 0x0000FFFF;
        assert( folded>0 );
      }
    }

    if( bRemoveDiacritic ){
      folded = remove_diacritic(folded);
    }
  }

  return folded;
}
363 #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
364 #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */
365