1 /*
2 ** 2012 May 25
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 ******************************************************************************
12 */
13 
14 /*
15 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
16 */
17 
18 
19 #include <assert.h>
20 
21 /*
22 ** Return true if the argument corresponds to a unicode codepoint
23 ** classified as either a letter or a number. Otherwise false.
24 **
25 ** The results are undefined if the value passed to this function
26 ** is less than zero.
27 */
int sqlite3Fts5UnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  **
  ** The entries are sorted in ascending order so that the array can be
  ** binary searched.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
    0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802,
    0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013,
    0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06,
    0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003,
    0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01,
    0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403,
    0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009,
    0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003,
    0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003,
    0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E,
    0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046,
    0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401,
    0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401,
    0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F,
    0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C,
    0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002,
    0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025,
    0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6,
    0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46,
    0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060,
    0x380400F0,
  };

  /* Bitmap for codepoints 0..127: bit (c & 31) of aAscii[c>>5] is set if
  ** codepoint c is NOT a letter or number.
  */
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( (unsigned int)c<128 ){
    /* The shifted constant must be unsigned: (c & 0x1F) may be 31, and
    ** left-shifting a signed 1 into the sign bit is undefined behaviour. */
    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
  }else if( (unsigned int)c<(1<<22) ){
    /* Setting the low 10 bits of the key makes it compare greater than or
    ** equal to every entry whose first codepoint is <= c, regardless of
    ** that entry's size field. */
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;

    /* Binary search for the last entry that is <= key. */
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
      }else{
        iHi = iTest-1;
      }
    }
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );

    /* c is alphanumeric iff it lies beyond the end of the range found. */
    return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }

  /* Values that do not fit in 22 bits are reported as alphanumeric. */
  return 1;
}
150 
151 
152 /*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
** E"). The results of passing a codepoint that corresponds to an
** uppercase letter are undefined.
159 */
static int fts5_remove_diacritic(int c){
  /* Each aDia[] entry describes a run of codepoints that all map to the
  ** character stored at the same index in aChar[]. The upper 13 bits of an
  ** entry hold the first codepoint of the run and the lower 3 bits hold the
  ** number of additional codepoints that follow it, so entry E covers
  ** codepoints (E>>3) through (E>>3)+(E&0x07) inclusive. Entries are sorted
  ** in ascending order so that the array can be binary searched.
  **
  ** Both tables are read-only, so they are declared static const rather
  ** than being rebuilt on the stack on every call.
  */
  static const unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995,
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286,
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732,
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336,
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928,
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234,
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504,
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529,
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726,
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122,
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536,
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730,
    62924, 63050, 63082, 63274, 63390,
  };

  /* Replacement character for each run in aDia[]. Runs whose replacement
  ** is '\0' cause this function to return 0 for their codepoints. */
  static const char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0',
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',
    'e',  'i',  'o',  'u',  'y',
  };

  /* Setting the low 3 bits of the key makes it compare greater than or
  ** equal to every entry whose first codepoint is <= c. */
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;

  /* Binary search for the last entry that is <= key. */
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  /* If c lies past the end of the run found, it has no mapping and is
  ** returned unchanged. Otherwise return the run's replacement. */
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}
204 
205 
206 /*
207 ** Return true if the argument interpreted as a unicode codepoint
208 ** is a diacritical modifier character.
209 */
int sqlite3Fts5UnicodeIsdiacritic(int c){
  /* Bit N of mask0 is set if codepoint 768+N is a diacritic (N in 0..31).
  ** Bit N of mask1 is set if codepoint 800+N is a diacritic (N in 0..17).
  ** Codepoints outside 768..817 are never reported as diacritics.
  **
  ** The return value is the masked bit itself, so it is non-zero (not
  ** necessarily 1) for a diacritic and 0 otherwise.
  */
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;

  /* Shift an unsigned 1: the shift count reaches 31 when c==799, and
  ** left-shifting a signed 1 into the sign bit is undefined behaviour. */
  return (c < 768+32) ?
      (mask0 & ((unsigned int)1 << (c-768))) :
      (mask1 & ((unsigned int)1 << (c-768-32)));
}
218 
219 
220 /*
221 ** Interpret the argument as a unicode codepoint. If the codepoint
222 ** is an upper case character that has a lower case equivalent,
223 ** return the codepoint corresponding to the lower case version.
224 ** Otherwise, return a copy of the argument.
225 **
226 ** The results are undefined if the value passed to this function
227 ** is less than zero.
228 */
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
  /* Case-folding rules for codepoints 128..65535. Every rule covers the
  ** nRange codepoints beginning at iCode, and the rules are sorted by
  ** iCode so that they can be binary searched.
  **
  ** Bit 0 of flags selects which members of the range are folded. When
  ** it is clear, all nRange codepoints are folded. When it is set, only
  ** every second codepoint is folded, beginning with iCode itself.
  **
  ** The remaining 7 bits of flags (flags>>1) index the aiOff[] array. A
  ** codepoint C that is folded becomes ((C + aiOff[flags>>1]) & 0xFFFF).
  **
  ** The contents of this array are generated by parsing the CaseFolding.txt
  ** file distributed as part of the "Unicode Character Database". See
  ** http://www.unicode.org for details.
  */
  static const struct TableEntry {
    unsigned short iCode;
    unsigned char flags;
    unsigned char nRange;
  } aEntry[] = {
    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
    {65313, 14, 26},
  };

  /* Offsets added (modulo 65536) to a codepoint to fold it; indexed by
  ** the upper 7 bits of TableEntry.flags. */
  static const unsigned short aiOff[] = {
   1,     2,     8,     15,    16,    26,    28,    32,
   37,    38,    40,    48,    63,    64,    69,    71,
   79,    80,    116,   202,   203,   205,   206,   207,
   209,   210,   211,   213,   214,   217,   218,   219,
   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721,
   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274,
   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406,
   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462,
   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511,
   65514, 65521, 65527, 65528, 65529,
  };

  const struct TableEntry *pRule;     /* Last rule with iCode<=c */
  int iBegin = 0;                     /* Search window is [iBegin, iEnd) */
  int iEnd = sizeof(aEntry)/sizeof(aEntry[0]);
  int folded = c;                     /* Value to return */

  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  /* Below 128 only 'A'..'Z' are folded; diacritics are never removed. */
  if( c<128 ){
    return ( c>='A' && c<='Z' ) ? c + ('a' - 'A') : c;
  }

  /* At 65536 and above the only folding is 66560..66599 -> +40. */
  if( c>=65536 ){
    return ( c>=66560 && c<66600 ) ? c + 40 : c;
  }

  /* 128 <= c <= 65535: locate the last rule whose iCode is <= c. On
  ** exit from the loop iBegin is one past that rule. */
  assert( c>aEntry[0].iCode );
  while( iBegin<iEnd ){
    int iMid = iBegin + (iEnd - iBegin)/2;
    if( aEntry[iMid].iCode<=c ){
      iBegin = iMid+1;
    }else{
      iEnd = iMid;
    }
  }
  assert( iBegin>0 && c>=aEntry[iBegin-1].iCode );
  pRule = &aEntry[iBegin-1];

  if( (c - pRule->iCode)<pRule->nRange ){
    /* Non-zero when the rule folds alternate codepoints only and c is
    ** one of the members that must be left alone. */
    int bSkip = 0x01 & pRule->flags & (pRule->iCode ^ c);
    if( !bSkip ){
      folded = (c + aiOff[pRule->flags>>1]) & 0x0000FFFF;
      assert( folded>0 );
    }
  }

  if( bRemoveDiacritic ){
    folded = fts5_remove_diacritic(folded);
  }
  return folded;
}
361