1 // ut_string_class.h
2 //
3 // A simple string class for use where templates are not
4 // allowed.
5 //
6 #ifndef UT_STRING_CLASS_H
7 #define UT_STRING_CLASS_H
8 
9 //
10 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
11 // Copyright (C) 2001 Dom Lachowicz <dominicl@seas.upenn.edu>
12 // Copyright (C) 2002 Tomas Frydrych <tomas@frydrych.uklinux.net>
13 //
14 // This class is free software; you can redistribute it and/or
15 // modify it under the terms of the GNU General Public License
16 // as published by the Free Software Foundation; either version 2
17 // of the License, or (at your option) any later version.
18 //
19 // This class is distributed in the hope that it will be useful,
20 // but WITHOUT ANY WARRANTY; without even the implied warranty of
21 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22 // GNU General Public License for more details.
23 //
24 // You should have received a copy of the GNU General Public License
25 // along with this program; if not, write to the Free Software
26 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
27 // 02110-1301 USA.
28 //
29 
30 #include <stdlib.h>
31 #include <stdarg.h>
32 
33 #if defined(__MINGW32__)
34 #  undef snprintf
35 #  if __GNUC__ <= 3
36 #    define _GLIBCXX_USE_C99_DYNAMIC 1
37 #  endif
38 #endif
39 
40 #include <string>
41 
42 /* pre-emptive dismissal; ut_types.h is needed by just about everything,
43  * so even if it's commented out in-file that's still a lot of work for
44  * the preprocessor to do...
45  */
46 #ifndef UT_TYPES_H
47 #include "ut_types.h"
48 #endif
49 #include "ut_string.h"
50 #include "ut_stringbuf.h"
51 
52 // Forward declarations
53 class UT_ByteBuf;
54 class UT_UCS4_mbtowc;
55 class UT_String;
56 class UT_UTF8String;
57 class UT_UCS4String;
58 
59 
60 // yes, this is screaming for a template
61 
62 ////////////////////////////////////////////////////////////////////////
63 //
64 //  8-bit string
65 //
66 //  String is built of 8-bit units (bytes)
67 //  Encoding could be any single-byte or multi-byte encoding
68 //
69 ////////////////////////////////////////////////////////////////////////
70 
71 //!
72 //	UT_String, a simple wrapper for zero terminated 'char' strings.
73 //
74 class ABI_EXPORT UT_String
75 {
76 public:
77 	UT_String();
78 	UT_String(const char* sz, size_t n = 0 /* 0 == zero-terminate */);
79 	UT_String(const UT_String& rhs);
80 	UT_String(const std::basic_string<char> &s);
81 	~UT_String();
82 
83 	size_t		size() const;
length()84 	size_t length () const { return size () ; }
85 	void            reserve(size_t n);
86 	bool		empty() const;
87 	void        clear() const;
88 
89 	UT_String	substr(size_t iStart, size_t nChars) const;
90 
91 	UT_String&	operator=(const UT_String& rhs);
92 	UT_String&	operator=(const char*      rhs);
93 	UT_String&	operator=(const std::basic_string<char> & rhs);
94 	UT_String&	operator+=(const UT_String& rhs);
95 	UT_String&	operator+=(const char*      rhs);
96 	UT_String&  operator+=(char rhs);
97 
98 	char		operator[](size_t iPos) const;
99 	char&		operator[](size_t iPos);
100 
101 	void		swap(UT_String& rhs);
102 
103 	// The returned pointer is valid until the next non-const
104 	// operation. You will _always_ get a legal pointer back,
105 	// even if to an empty string.
106 	const char* c_str() const;
107 
108 private:
109 	class UT_StringImpl<char>* pimpl;
110 };
111 
112 // helpers
113 ABI_EXPORT bool operator==(const UT_String& s1, const UT_String& s2);
114 ABI_EXPORT bool operator==(const UT_String& s1, const char*      s2);
115 ABI_EXPORT bool operator==(const char*      s1, const UT_String& s2);
116 ABI_EXPORT bool operator!=(const UT_String& s1, const UT_String& s2);
117 ABI_EXPORT bool operator!=(const UT_String& s1, const char*      s2);
118 ABI_EXPORT bool operator!=(const char*      s1, const UT_String& s2);
119 
120 ABI_EXPORT UT_uint32 hashcode(const UT_String& string);
121 ABI_EXPORT UT_uint32 hashcode(const char *s);
122 
123 // strcmp ordering
124 ABI_EXPORT bool operator<(const UT_String& s1, const UT_String& s2);
125 
126 ABI_EXPORT UT_String operator+(const UT_String& s1, const UT_String& s2);
127 
128 ABI_EXPORT size_t UT_String_findCh(const UT_String &st, char ch);
129 ABI_EXPORT size_t UT_String_findRCh(const UT_String &st, char ch);
130 
131 /****************************************************************************/
132 
133 /*!
134  * Fill \inStr with the results of evaulating the printf formatted string
135  * \inFormat and return the reference to \inStr
136  */
137 ABI_EXPORT UT_String& UT_String_sprintf(UT_String & inStr, const char * inFormat, ...) ABI_PRINTF_FORMAT(2,3);
138 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const char *format,
139                                          va_list      args1)
140     ABI_PRINTF_FORMAT(2,0);
141 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const UT_String & format,
142 					 va_list      args1);
143 
144 /*!
145  * Returns a new UT_String object with the results of evaluating the printf
146  * formatted string \inFormat
147  */
148 ABI_EXPORT UT_String UT_String_sprintf(const char * inFormat, ...)
149     ABI_PRINTF_FORMAT(1,2);
150 ABI_EXPORT UT_String UT_String_vprintf(const char * inFormat, va_list args1)
151     ABI_PRINTF_FORMAT(1,0);
152 ABI_EXPORT UT_String UT_String_vprintf(const UT_String & inFormat, va_list args1);
153 
154 /***************************************************************************/
155 
156 /***************************************************************************/
157 /*!
158  * Some functions to add/subtract and extract UT_String properties from a UT_String of properties.
159  */
160 
161 ABI_EXPORT UT_String UT_String_getPropVal(const UT_String & sPropertyString, const UT_String & sProp);
162 ABI_EXPORT void UT_String_removeProperty(UT_String & sPropertyString, const UT_String & sProp);
163 ABI_EXPORT void UT_String_setProperty(UT_String & sPropertyString, const UT_String &sProp, const UT_String & sVal);
164 ABI_EXPORT void UT_String_addPropertyString(UT_String & sPropertyString, const UT_String & sNewProp);
165 
166 ////////////////////////////////////////////////////////////////////////
167 //
168 //  UTF-8 string: encoding is *always* UTF-8
169 //
170 ////////////////////////////////////////////////////////////////////////
171 
172 //!
173 //	UT_UTF8String, a simple wrapper for zero terminated 'UTF-8' strings.
174 //
175 
176 class ABI_EXPORT UT_UTF8String
177 {
178 public:
179 	UT_UTF8String ();
180 	UT_UTF8String (const char * sz, size_t n = 0 /* 0 == null-termination */);
181 	UT_UTF8String (const char *sz, const char *encoding);
182 
183 	UT_UTF8String (const UT_UTF8String & rhs);
184 	UT_UTF8String (const UT_UCS4String & rhs);
185 	UT_UTF8String (const UT_UCSChar * sz, size_t n = 0 /* 0 == zero-terminate */);
186 
187 	~UT_UTF8String ();
188 
189 	size_t		size () const;
length()190 	size_t length () const { return size () ; }
191 
192 	void            reserve(size_t n);
193 	bool		empty () const;
194 	void		clear () const;
195 	size_t		byteLength() const;
196 	void        dump(void) const;
197 	UT_UTF8String	substr(size_t iStart, size_t nChars) const;
198 
199 	UT_UTF8String &	operator=(const char *          rhs);
200 	UT_UTF8String &	operator=(const std::string &   rhs);
201 	UT_UTF8String &	operator=(const UT_UTF8String & rhs);
202 	UT_UTF8String &	operator=(const UT_UCS4String & rhs);
203 
204 	UT_UTF8String &	operator+=(const UT_UCS4Char     rhs);
205 	UT_UTF8String &	operator+=(const char *          rhs);
206 	UT_UTF8String &	operator+=(const std::string &   rhs);
207 	UT_UTF8String &	operator+=(const UT_UTF8String & rhs);
208 	UT_UTF8String &	operator+=(const UT_UCS4String & rhs);
209 
210 	// The returned pointer is valid until the next non-const
211 	// operation. You will _always_ get a legal pointer back,
212 	// even if to an empty (0) string.
213 	const char * utf8_str () const;
214 	UT_UCS4String ucs4_str ();
215 
216 	void		assign (const char * sz, size_t n = 0 /* 0 == null-termination */);
217 	void		append (const char * sz, size_t n = 0 /* 0 == null-termination */);
218 	void        appendBuf (const UT_ByteBuf & buf, UT_UCS4_mbtowc & converter);
219 
220 	void		appendUCS4 (const UT_UCS4Char * sz, size_t n = 0 /* 0 == null-termination */);
221 	void		appendUCS2 (const UT_UCS2Char * sz, size_t n = 0 /* 0 == null-termination */);
222 
223 	const UT_UTF8String & escape (const UT_UTF8String & str1,
224 				      const UT_UTF8String & str2);  // replaces <str1> with <str2> in the current string
225 	const UT_UTF8String & escapeXML ();  // escapes '<', '>', '"', & '&' in the current string
226 	const UT_UTF8String & decodeXML ();  // unescapes '<', '>', '"', & '&' in the current string
227 	const UT_UTF8String & escapeMIME (); // translates the current string to MIME "quoted-printable" format
228 	const UT_UTF8String & lowerCase ();  // forces current string to lowercase
229 	const UT_UTF8String & escapeURL ();  // make URL confirm to RFC 1738
230 	const UT_UTF8String & decodeURL ();
231 
232 	/* UTF8String - NOTES
233 	 *
234 	 * TODO:
235 	 * 1. Maybe have a search&replace function, something like:
236 	 *
237 	 * 	int replace (const char * utf_newstr, const char * utf_oldstr);
238 	 *
239 	 *    which could be used to do substitutions, e.g.:
240 	 *
241 	 * 	UTF8String xmlstr = "expr: if ((c > 0) && (c < 0x80)) return c;";
242 	 * 	xmlstr.replace ("&lt;", "<");
243 	 * 	xmlstr.replace ("&gt;", ">");
244 	 * 	xmlstr.replace ("&amp;","&");
245 	 *
246 	 * 	MIQ: Note that for these replace methods, one might use ut_std_string/replace_all()
247 	 *
248 	 *
249 	 * getIterator:
250 	 * returns a home-made iterator associated with the UTF-8 string, e.g.:
251 	 *
252 	 * 	UTF8String str = "This is a UTF-8 string.";
253 	 * 	UT_UTF8Stringbuf::UTF8Iterator & iter = str.getIterator ();
254 	 * 	iter = iter.start (); // iter.start() returns 0 if no string, so:
255 	 * 	if (iter.current ())
256 	 * 	{
257 	 * 		while (true)
258 	 * 		{
259 	 * 			char * pUTF = iter.current ();
260 	 * 			if (*pUTF == 0) break; // end-of-string
261 	 * 			// etc.
262 	 * 			iter.advance (); // or ++iter;
263 	 * 		}
264 	 * 	}
265 	 *
266 	 * The iterator will be well behaved provided the string is not being edited.
267 	 */
getIterator()268 	UT_UTF8Stringbuf::UTF8Iterator getIterator () const
269 	{
270 		return UT_UTF8Stringbuf::UTF8Iterator(pimpl);
271 	}
272 
273 private:
274 	class UT_UTF8Stringbuf * pimpl;
275 };
276 
277 ABI_EXPORT bool operator<(const UT_UTF8String& s1, const UT_UTF8String& s2);
278 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const UT_UTF8String& s2);
279 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const UT_UTF8String& s2);
280 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const char * s2);
281 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const char * s2);
282 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const std::string & s2);
283 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const std::string & s2);
284 ABI_EXPORT bool operator==(const std::string & s2, const UT_UTF8String& s1);
285 ABI_EXPORT bool operator!=(const std::string & s2, const UT_UTF8String& s1);
286 ABI_EXPORT UT_UTF8String operator+(const UT_UTF8String & s1, const UT_UTF8String & s2);
287 ABI_EXPORT UT_UTF8String UT_UTF8String_sprintf(const char * inFormat, ...);
288 ABI_EXPORT UT_UTF8String & UT_UTF8String_sprintf(UT_UTF8String & inStr, const char * inFormat, ...);
289 
290 
291 /***************************************************************************/
292 /*!
293  * Some functions to add/subtract and extract UT_String properties from a UT_String of properties.
294  */
295 
296 ABI_EXPORT UT_UTF8String UT_UTF8String_getPropVal(const UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
297 
298 ABI_EXPORT void UT_UTF8String_removeProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
299 
300 ABI_EXPORT void UT_UTF8String_setProperty(UT_UTF8String & sPropertyString, const UT_UTF8String &sProp, const UT_UTF8String & sVal);
301 
302 ABI_EXPORT void UT_UTF8String_addPropertyString(UT_UTF8String & sPropertyString, const UT_UTF8String & sNewProp);
303 
304 ABI_EXPORT void UT_UTF8String_replaceString(UT_UTF8String & sString, const UT_UTF8String & sOldValue,const UT_UTF8String & sNewValue );
305 
306 ////////////////////////////////////////////////////////////////////////
307 //
308 //  UCS-4 string
309 //
310 //  String is built of 32-bit units (longs)
311 //
312 //  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
313 //  NOTE:  in the case of UCS-4 and UTF-32 since they really are
314 //  NOTE:  identical
315 //
316 ////////////////////////////////////////////////////////////////////////
317 
318 //!
319 //	UT_UCS4String, a simple wrapper for zero terminated 'UCS4' strings.
320 //
321 
322 // TODO: add c_str(), encoded_str(const char * to)
323 
324 class ABI_EXPORT UT_UCS4String
325 {
326 public:
327 	UT_UCS4String();
328 	UT_UCS4String(const UT_UCS4Char * sz, size_t n = 0 /* 0 == zero-terminate */);
329 	UT_UCS4String(const UT_UCS4String& rhs);
330 
331 	/* construct from a string in UTF-8 format
332 	 */
333 	UT_UCS4String(const char * utf8_str, size_t bytelength = 0 /* 0 == zero-terminate */);
334 	UT_UCS4String(const std::string & str /* zero-terminated utf-8 encoded */);
335 
336 	/* construct from a string in UTF-8 format
337 	 * if (strip_whitespace == true) replace all white space sequences with a single UCS_SPACE
338 	 * if (strip_whitespace != true) replace CR-LF & CR by LF
339 	 * non-breaking spaces (&nbsp; UCS_NBSP 0x0a) are not white space; see UT_UCS4_isspace()
340 	 */
341 	UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */, bool strip_whitespace);
342 
343 	~UT_UCS4String();
344 
345 	size_t	size() const;
length()346 	size_t length () const { return size () ; }
347 
348 	void            reserve(size_t n);
349 	bool		empty() const;
350 	void        clear() const;
351 
352 	UT_UCS4String	substr(size_t iStart, size_t nChars) const;
353 	UT_UCS4String	substr(size_t iStart) const;
354 	UT_UCS4String	substr( const UT_UCS4Char* iter ) const;
355 
356 	UT_UCS4String&	operator=(const UT_UCS4String&  rhs);
357 	UT_UCS4String&	operator=(const UT_UCS4Char *   rhs);
358 	UT_UCS4String&	operator+=(const UT_UCS4String& rhs);
359 	UT_UCS4String&	operator+=(const UT_UCS4Char *  rhs);
360 	UT_UCS4String&  operator+=(UT_UCS4Char rhs);
361 	UT_UCS4String&  operator+=(char rhs);
362 	UT_UCS4String&  operator+=(unsigned char rhs);
363 
364 	UT_UCS4Char		operator[](size_t iPos) const;
365 	UT_UCS4Char&	operator[](size_t iPos);
366 
367 	void		swap(UT_UCS4String& rhs);
368 
369 	// The returned pointer is valid until the next non-const
370 	// operation. You will _always_ get a legal pointer back,
371 	// even if to an empty (0) string.
372 	const UT_UCS4Char* ucs4_str() const;
373 
374     // The same valid constraints as ucs4_str() applies to begin and end
375     const UT_UCS4Char* begin() const;
376     const UT_UCS4Char* end()   const;
377 
378 	const char * utf8_str ();
379 
380 private:
381 	void _loadUtf8(const char * utf8_str, size_t bytelength); // implementation detail for the UTF-8 constructor
382 	class UT_StringImpl<UT_UCS4Char>* pimpl;
383 };
384 
385 // helpers
386 bool operator==(const UT_UCS4String& s1, const UT_UCS4String& s2);
387 bool operator==(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
388 bool operator==(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
389 bool operator!=(const UT_UCS4String& s1, const UT_UCS4String& s2);
390 bool operator!=(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
391 bool operator!=(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
392 
393 // strcmp ordering
394 bool operator<(const UT_UCS4String& s1, const UT_UCS4String& s2);
395 
396 UT_UCS4String operator+(const UT_UCS4String& s1, const UT_UCS4String& s2);
397 
398 
399 
400 #endif	// UT_STRING_CLASS_H
401