1 /******************************************************************************
2  *
3  *  utilstr.h -	prototypes for string utility functions
4  *
5  * $Id: utilstr.h 3515 2017-11-01 11:38:09Z scribe $
6  *
7  * Copyright 1997-2013 CrossWire Bible Society (http://www.crosswire.org)
8  *	CrossWire Bible Society
9  *	P. O. Box 2528
10  *	Tempe, AZ  85280-2528
11  *
12  * This program is free software; you can redistribute it and/or modify it
13  * under the terms of the GNU General Public License as published by the
14  * Free Software Foundation version 2.
15  *
16  * This program is distributed in the hope that it will be useful, but
17  * WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  */
22 
23 #ifndef UTILSTR_H
24 #define UTILSTR_H
25 
26 #include <defs.h>
27 #include <sysdata.h>
28 #include <swbuf.h>
29 
30 SWORD_NAMESPACE_START
31 
32 
/******************************************************************************
 * stdstr - replaces the string held in *ipstr with a freshly allocated copy
 *		of istr
 *
 * ENT:	ipstr		- address of the string pointer to (re)set; the buffer
 *			  it currently points to (if any) is released with
 *			  delete []
 *	istr		- string to copy into *ipstr
 *			  0 - just release the old buffer and set *ipstr to 0
 *	memPadFactor	- the new buffer is allocated memPadFactor times larger
 *			  than istr needs (terminator included); 0 is treated
 *			  as 1 so the copy always fits
 *
 * RET:	*ipstr (the new copy, or 0 if istr was 0)
 *
 * NOTE: the copy is made before the old buffer is released, so istr may
 *	safely be *ipstr itself or point somewhere into it.
 */

inline char *stdstr(char **ipstr, const char *istr, unsigned int memPadFactor = 1) {
	char *copy = 0;
	if (istr) {
		const size_t len = strlen(istr) + 1;	// include the terminator
		const size_t factor = memPadFactor ? memPadFactor : 1;
		copy = new char [ len * factor ];
		memcpy(copy, istr, len);
	}
	// Release the previous buffer only now: istr may alias it, and if the
	// allocation above throws the caller's pointer is still valid.
	delete [] *ipstr;	// deleting a null pointer is a no-op
	*ipstr = copy;
	return copy;
}
54 
55 SWDLLEXPORT char *strstrip (char *istr);
56 SWDLLEXPORT const char *stristr (const char *s1, const char *s2);
57 SWDLLEXPORT int strnicmp(const char *s1, const char *s2, int len);
58 SWDLLEXPORT int stricmp(const char *s1, const char *s2);
59 
/******************************************************************************
 * SW_toupper - array of uppercase values for any given Latin-1 value
 *
 * use this instead of toupper() for fast lookups on accented characters
 *
 * The whole argument is converted to unsigned char before it is used as the
 * index, so expression arguments and negative (signed) char values both stay
 * inside the 256-entry table.  The argument is evaluated exactly once.
 */
extern const unsigned char SW_toupper_array[256];
#define SW_toupper(c) (SW_toupper_array[(unsigned char)(c)])
67 
68 /******************************************************************************
69  * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
70  * 					and increments buf to start of next codepoint
71  *
72  * ENT:	buf - address of a utf8 buffer
73  *
74  * RET:	buf - incremented past last byte used in computing the current codepoint
 * 		unicode codepoint value (0 with buf incremented means an invalid UTF8 byte)
76  */
77 
78 
79 /******************************************************************************
80  * getUniCharFromUTF8 - retrieves the next Unicode codepoint from a UTF8 string
81  * 					and increments buf to start of next codepoint
82  *
83  * ENT:	buf - address of a utf8 buffer
84  *
85  * RET:	buf - incremented past last byte used in computing the current codepoint
86  * 		unicode codepoint value (0 with buf incremented is invalid UTF8 byte
87  */
88 
89 inline __u32 getUniCharFromUTF8(const unsigned char **buf, bool skipValidation = false) {
90 	__u32 ch = 0;
91 
92 	//case: We're at the end
93 	if (!(**buf)) {
94 		return ch;
95 	}
96 
97 	//case: ANSI
98 	if (!(**buf & 128)) {
99 		ch = **buf;
100 		(*buf)++;
101 		return ch;
102 	}
103 
104 	//case: Invalid UTF-8 (illegal continuing byte in initial position)
105 	if ((**buf >> 6) == 2) {
106 		(*buf)++;
107 		return ch;
108 	}
109 
110 
111 	//case: 2+ byte codepoint
112 	int subsequent = 1;
113 	if ((**buf & 32) == 0) { subsequent = 1; }
114 	else if ((**buf & 16) == 0) { subsequent = 2; }
115 	else if ((**buf &  8) == 0) { subsequent = 3; }
116 	else if ((**buf &  4) == 0) { subsequent = 4; }
117 	else if ((**buf &  2) == 0) { subsequent = 5; }
118 	else if ((**buf &  1) == 0) { subsequent = 6; }
119 	else subsequent = 7; // is this legal?
120 
121 	ch = **buf & (0xFF>>(subsequent + 1));
122 
123 	for (int i = 1; i <= subsequent; ++i) {
124 		// subsequent byte did not begin with 10XXXXXX
125 		// move our buffer to here and error out
126 		// this also catches our null if we hit the string terminator
127 		if (((*buf)[i] >> 6) != 2) {
128 			*buf += i;
129 			return 0;
130 		}
131 		ch <<= 6;
132 		ch |= (*buf)[i] & 63;
133 	}
134 	*buf += (subsequent+1);
135 
136 	if (!skipValidation) {
137 		// I THINK THIS IS STUPID BUT THE SPEC SAYS NO MORE THAN 4 BYTES
138 		if (subsequent > 3) ch = 0;
139 		// AGAIN stupid, but spec says UTF-8 can't use more than 21 bits
140 		if (ch > 0x1FFFFF) ch = 0;
141 		// This would be out of Unicode bounds
142 		if (ch > 0x10FFFF) ch = 0;
143 		// these would be values which could be represented in less bytes
144 		if (ch < 0x80 && subsequent > 0) ch = 0;
145 		if (ch < 0x800 && subsequent > 1) ch = 0;
146 		if (ch < 0x10000 && subsequent > 2) ch = 0;
147 		if (ch < 0x200000 && subsequent > 3) ch = 0;
148 	}
149 
150 	return ch;
151 }
152 
153 
154 /******************************************************************************
155  * getUTF8FromUniChar - retrieves us UTF8 string from a
156  * 					Unicode codepoint
157  *
158  * ENT:	uchar - unicode codepoint value
159  *
160  * RET:	buf - a UTF8 string which consists of the proper UTF8 sequence of
161  * 				bytes for the given Unicode codepoint
162  * NOTE: for speed and thread safety, this method now requires a buffer
163  * 		to work with
164  */
165 
getUTF8FromUniChar(__u32 uchar,SWBuf * appendTo)166 inline SWBuf *getUTF8FromUniChar(__u32 uchar, SWBuf *appendTo) {
167 	unsigned long base = appendTo->size();
168 
169 	// This would be out of Unicode bounds
170 	if (uchar > 0x10FFFF) uchar = 0xFFFD;
171 	char bytes = uchar < 0x80 ? 1 : uchar < 0x800 ? 2 : uchar < 0x10000 ? 3 : 4;
172 	appendTo->setSize(base+bytes);
173 	switch (bytes) {
174 	case 1:
175 		(*appendTo)[base  ] = (unsigned char)uchar;
176 		break;
177 	case 2:
178 		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
179 		uchar >>= 6;
180 		(*appendTo)[base  ] = (unsigned char)(0xc0 | (uchar & 0x1f));
181 		break;
182 	case 3:
183 		(*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
184 		uchar >>= 6;
185 		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
186 		uchar >>= 6;
187 		(*appendTo)[base  ] = (unsigned char)(0xe0 | (uchar & 0x0f));
188 		break;
189 	case 4:
190 		(*appendTo)[base+3] = (unsigned char)(0x80 | (uchar & 0x3f));
191 		uchar >>= 6;
192 		(*appendTo)[base+2] = (unsigned char)(0x80 | (uchar & 0x3f));
193 		uchar >>= 6;
194 		(*appendTo)[base+1] = (unsigned char)(0x80 | (uchar & 0x3f));
195 		uchar >>= 6;
196 		(*appendTo)[base  ] = (unsigned char)(0xf0 | (uchar & 0x07));
197 		break;
198 	}
199 /*
200 	else if (uchar < 0x4000000) {
201 		appendTo->setSize(base+5);
202 		i = uchar & 0x3f;
203 		(*appendTo)[base+4] = (unsigned char)(0x80 | i);
204 		uchar >>= 6;
205 
206 		i = uchar & 0x3f;
207 		(*appendTo)[base+3] = (unsigned char)(0x80 | i);
208 		uchar >>= 6;
209 
210 		i = uchar & 0x3f;
211 		(*appendTo)[base+2] = (unsigned char)(0x80 | i);
212 		uchar >>= 6;
213 
214 		i = uchar & 0x3f;
215 		(*appendTo)[base+1] = (unsigned char)(0x80 | i);
216 		uchar >>= 6;
217 
218 		i = uchar & 0x03;
219 		(*appendTo)[base] = (unsigned char)(0xf8 | i);
220 	}
221 	else if (uchar < 0x80000000) {
222 		appendTo->setSize(base+6);
223 		i = uchar & 0x3f;
224 		(*appendTo)[base+5] = (unsigned char)(0x80 | i);
225 		uchar >>= 6;
226 
227 		i = uchar & 0x3f;
228 		(*appendTo)[base+4] = (unsigned char)(0x80 | i);
229 		uchar >>= 6;
230 
231 		i = uchar & 0x3f;
232 		(*appendTo)[base+3] = (unsigned char)(0x80 | i);
233 		uchar >>= 6;
234 
235 		i = uchar & 0x3f;
236 		(*appendTo)[base+2] = (unsigned char)(0x80 | i);
237 		uchar >>= 6;
238 
239 		i = uchar & 0x3f;
240 		(*appendTo)[base+1] = (unsigned char)(0x80 | i);
241 		uchar >>= 6;
242 
243 		i = uchar & 0x01;
244 		(*appendTo)[base] = (unsigned char)(0xfc | i);
245 	}
246 */
247 	return appendTo;
248 }
249 
250 
251 /******************************************************************************
252  * assureValidUTF8 - iterates the supplied UTF-8 buffer and checks for validity
253  * 					replacing invalid bytes if necessary and returning a
254  *					verified UTF8 buffer, leaving the original input
255  *					unchanged.
256  *
257  * ENT:	buf - a utf8 buffer
258  *
259  * RET:	input buffer validated and any problems fixed by substituting a
260  * 		replacement character for bytes not valid.
261  */
262 SWBuf assureValidUTF8(const char *buf);
263 
264 /****
265  * This can be called to convert a UTF8 stream to an SWBuf which manages
266  *	a wchar_t[]
267  *	access buffer with (wchar_t *)SWBuf::getRawData();
268  *
269  */
270 SWBuf utf8ToWChar(const char *buf);
271 
272 /****
273  * This can be called to convert a wchar_t[] to a UTF-8 SWBuf
274  *
275  */
276 SWBuf wcharToUTF8(const wchar_t *buf);
277 
278 
279 
280 SWORD_NAMESPACE_END
281 #endif
282