1 /******************************************************************************
2 *
3 * utilstr.h - prototypes for string utility functions
4 *
5 * $Id: utilstr.h 3515 2017-11-01 11:38:09Z scribe $
6 *
7 * Copyright 1997-2013 CrossWire Bible Society (http://www.crosswire.org)
8 * CrossWire Bible Society
9 * P. O. Box 2528
10 * Tempe, AZ 85280-2528
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the
14 * Free Software Foundation version 2.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 */
22
23 #ifndef UTILSTR_H
24 #define UTILSTR_H
25
26 #include <defs.h>
27 #include <sysdata.h>
28 #include <swbuf.h>
29
30 SWORD_NAMESPACE_START
31
32
/******************************************************************************
 * stdstr - replaces the string held in *ipstr with a freshly allocated copy
 *	    of istr
 *
 * ENT:	ipstr		- address of the string pointer to update; whatever it
 *			  currently points to is released with delete []
 *	istr		- string to copy into *ipstr
 *			  0 - only release *ipstr and leave it set to 0
 *	memPadFactor	- the new buffer is allocated memPadFactor times the
 *			  size needed for istr (terminator included);
 *			  0 is treated as 1 so that the copy always fits
 *
 * RET:	*ipstr (0 when istr is 0)
 */

inline char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor = 1) {
	char *copy = 0;
	if (istr) {
		const size_t len = strlen(istr) + 1;
		// a factor of 0 would yield an empty allocation that the
		// memcpy below would overrun
		const size_t factor = memPadFactor ? memPadFactor : 1;
		copy = new char [ len * factor ];
		memcpy(copy, istr, len);
	}
	// release the old buffer only after the copy has been made, so that
	// istr may safely point into *ipstr (e.g. stdstr(&s, s))
	delete [] *ipstr;
	*ipstr = copy;
	return *ipstr;
}
54
55 SWDLLEXPORT char *strstrip (char *istr);
56 SWDLLEXPORT const char *stristr (const char *s1, const char *s2);
57 SWDLLEXPORT int strnicmp(const char *s1, const char *s2, int len);
58 SWDLLEXPORT int stricmp(const char *s1, const char *s2);
59
/******************************************************************************
 * SW_toupper - looks a character up in SW_toupper_array, the array of
 *		uppercase values for any given Latin-1 value
 *
 * use this instead of toupper() for fast lookups on accented characters
 *
 * The argument is parenthesized before the cast so that an expression
 * argument (e.g. SW_toupper(a + b)) is narrowed to unsigned char as a whole
 * and the index always stays inside the 256 entry table.
 */
extern const unsigned char SW_toupper_array[256];
#define SW_toupper(c) (SW_toupper_array[(unsigned char)(c)])
67
68 /******************************************************************************
69 * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
70 * and increments buf to start of next codepoint
71 *
72 * ENT: buf - address of a utf8 buffer
73 *
74 * RET: buf - incremented past last byte used in computing the current codepoint
75 * unicode codepoint value (0 with buf incremented is invalid UTF8 byte
76 */
77
78
79 /******************************************************************************
80 * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
81 * and increments buf to start of next codepoint
82 *
83 * ENT: buf - address of a utf8 buffer
84 *
85 * RET: buf - incremented past last byte used in computing the current codepoint
86 * unicode codepoint value (0 with buf incremented is invalid UTF8 byte
87 */
88
89 inline __u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation = false) {
90 __u32 ch = 0;
91
92 //case: We're at the end
93 if (!(**buf)) {
94 return ch;
95 }
96
97 //case: ANSI
98 if (!(**buf & 128)) {
99 ch = **buf;
100 (*buf)++;
101 return ch;
102 }
103
104 //case: Invalid UTF-8 (illegal continuing byte in initial position)
105 if ((**buf >> 6) == 2) {
106 (*buf)++;
107 return ch;
108 }
109
110
111 //case: 2+ byte codepoint
112 int subsequent = 1;
113 if ((**buf & 32) == 0) { subsequent = 1; }
114 else if ((**buf & 16) == 0) { subsequent = 2; }
115 else if ((**buf & 8) == 0) { subsequent = 3; }
116 else if ((**buf & 4) == 0) { subsequent = 4; }
117 else if ((**buf & 2) == 0) { subsequent = 5; }
118 else if ((**buf & 1) == 0) { subsequent = 6; }
119 else subsequent = 7; // is this legal?
120
121 ch = **buf & (0xFF>>(subsequent + 1));
122
123 for (int i = 1; i <= subsequent; ++i) {
124 // subsequent byte did not begin with 10XXXXXX
125 // move our buffer to here and error out
126 // this also catches our null if we hit the string terminator
127 if (((*buf)[i] >> 6) != 2) {
128 *buf += i;
129 return 0;
130 }
131 ch <<= 6;
132 ch |= (*buf)[i] & 63;
133 }
134 *buf += (subsequent+1);
135
136 if (!skipValidation) {
137 // I THINK THIS IS STUPID BUT THE SPEC SAYS NO MORE THAN 4 BYTES
138 if (subsequent > 3) ch = 0;
139 // AGAIN stupid, but spec says UTF-8 can't use more than 21 bits
140 if (ch > 0x1FFFFF) ch = 0;
141 // This would be out of Unicode bounds
142 if (ch > 0x10FFFF) ch = 0;
143 // these would be values which could be represented in less bytes
144 if (ch < 0x80 && subsequent > 0) ch = 0;
145 if (ch < 0x800 && subsequent > 1) ch = 0;
146 if (ch < 0x10000 && subsequent > 2) ch = 0;
147 if (ch < 0x200000 && subsequent > 3) ch = 0;
148 }
149
150 return ch;
151 }
152
153
154 /******************************************************************************
155 * getUTF8FromUniChar - retrieves us UTF8 string from a
156 * Unicode codepoint
157 *
158 * ENT: uchar - unicode codepoint value
159 *
160 * RET: buf - a UTF8 string which consists of the proper UTF8 sequence of
161 * bytes for the given Unicode codepoint
162 * NOTE: for speed and thread safety, this method now requires a buffer
163 * to work with
164 */
165
getUTF8FromUniChar(__u32 uchar,SWBuf * appendTo)166 inline SWBuf *getUTF8FromUniChar(__u32 uchar, SWBuf *appendTo) {
167 unsigned long base = appendTo->size();
168
169 // This would be out of Unicode bounds
170 if (uchar > 0x10FFFF) uchar = 0xFFFD;
171 char bytes = uchar < 0x80 ? 1 : uchar < 0x800 ? 2 : uchar < 0x10000 ? 3 : 4;
172 appendTo->setSize(base+bytes);
173 switch (bytes) {
174 case 1:
175 (*appendTo)[base ] = (unsigned char)uchar;
176 break;
177 case 2:
178 (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
179 uchar >>= 6;
180 (*appendTo)[base ] = (unsigned char)(0xc0 | (uchar & 0x1f));
181 break;
182 case 3:
183 (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
184 uchar >>= 6;
185 (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
186 uchar >>= 6;
187 (*appendTo)[base ] = (unsigned char)(0xe0 | (uchar & 0x0f));
188 break;
189 case 4:
190 (*appendTo)[base+3] = (unsigned char)(0x80 | (uchar & 0x3f));
191 uchar >>= 6;
192 (*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
193 uchar >>= 6;
194 (*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
195 uchar >>= 6;
196 (*appendTo)[base ] = (unsigned char)(0xf0 | (uchar & 0x07));
197 break;
198 }
199 /*
200 else if (uchar < 0x4000000) {
201 appendTo->setSize(base+5);
202 i = uchar & 0x3f;
203 (*appendTo)[base+4] = (unsigned char)(0x80 | i);
204 uchar >>= 6;
205
206 i = uchar & 0x3f;
207 (*appendTo)[base+3] = (unsigned char)(0x80 | i);
208 uchar >>= 6;
209
210 i = uchar & 0x3f;
211 (*appendTo)[base+2] = (unsigned char)(0x80 | i);
212 uchar >>= 6;
213
214 i = uchar & 0x3f;
215 (*appendTo)[base+1] = (unsigned char)(0x80 | i);
216 uchar >>= 6;
217
218 i = uchar & 0x03;
219 (*appendTo)[base] = (unsigned char)(0xf8 | i);
220 }
221 else if (uchar < 0x80000000) {
222 appendTo->setSize(base+6);
223 i = uchar & 0x3f;
224 (*appendTo)[base+5] = (unsigned char)(0x80 | i);
225 uchar >>= 6;
226
227 i = uchar & 0x3f;
228 (*appendTo)[base+4] = (unsigned char)(0x80 | i);
229 uchar >>= 6;
230
231 i = uchar & 0x3f;
232 (*appendTo)[base+3] = (unsigned char)(0x80 | i);
233 uchar >>= 6;
234
235 i = uchar & 0x3f;
236 (*appendTo)[base+2] = (unsigned char)(0x80 | i);
237 uchar >>= 6;
238
239 i = uchar & 0x3f;
240 (*appendTo)[base+1] = (unsigned char)(0x80 | i);
241 uchar >>= 6;
242
243 i = uchar & 0x01;
244 (*appendTo)[base] = (unsigned char)(0xfc | i);
245 }
246 */
247 return appendTo;
248 }
249
250
251 /******************************************************************************
252 * assureValidUTF8 - iterates the supplied UTF-8 buffer and checks for validity
253 * replacing invalid bytes if necessary and returning a
254 * verified UTF8 buffer, leaving the original input
255 * unchanged.
256 *
257 * ENT: buf - a utf8 buffer
258 *
259 * RET: input buffer validated and any problems fixed by substituting a
260 * replacement character for bytes not valid.
261 */
262 SWBuf assureValidUTF8(const char *buf);
263
264 /****
265 * This can be called to convert a UTF8 stream to an SWBuf which manages
266 * a wchar_t[]
267 * access buffer with (wchar_t *)SWBuf::getRawData();
268 *
269 */
270 SWBuf utf8ToWChar(const char *buf);
271
272 /****
273 * This can be called to convert a wchar_t[] to a UTF-8 SWBuf
274 *
275 */
276 SWBuf wcharToUTF8(const wchar_t *buf);
277
278
279
280 SWORD_NAMESPACE_END
281 #endif
282