1 // ut_string_class.h 2 // 3 // A simple string class for use where templates are not 4 // allowed. 5 // 6 #ifndef UT_STRING_CLASS_H 7 #define UT_STRING_CLASS_H 8 9 // 10 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se> 11 // Copyright (C) 2001 Dom Lachowicz <dominicl@seas.upenn.edu> 12 // Copyright (C) 2002 Tomas Frydrych <tomas@frydrych.uklinux.net> 13 // 14 // This class is free software; you can redistribute it and/or 15 // modify it under the terms of the GNU General Public License 16 // as published by the Free Software Foundation; either version 2 17 // of the License, or (at your option) any later version. 18 // 19 // This class is distributed in the hope that it will be useful, 20 // but WITHOUT ANY WARRANTY; without even the implied warranty of 21 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 // GNU General Public License for more details. 23 // 24 // You should have received a copy of the GNU General Public License 25 // along with this program; if not, write to the Free Software 26 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 27 // 02110-1301 USA. 28 // 29 30 #include <stdlib.h> 31 #include <stdarg.h> 32 33 #if defined(__MINGW32__) 34 # undef snprintf 35 # if __GNUC__ <= 3 36 # define _GLIBCXX_USE_C99_DYNAMIC 1 37 # endif 38 #endif 39 40 #include <string> 41 42 /* pre-emptive dismissal; ut_types.h is needed by just about everything, 43 * so even if it's commented out in-file that's still a lot of work for 44 * the preprocessor to do... 45 */ 46 #ifndef UT_TYPES_H 47 #include "ut_types.h" 48 #endif 49 #include "ut_string.h" 50 #include "ut_stringbuf.h" 51 52 // Forward declarations 53 class UT_ByteBuf; 54 class UT_UCS4_mbtowc; 55 class UT_String; 56 class UT_UTF8String; 57 class UT_UCS4String; 58 59 60 // yes, this is screaming for a template 61 62 //////////////////////////////////////////////////////////////////////// 63 // 64 // 8-bit string 65 // 66 // String is built of 8-bit units (bytes) 67 // Encoding could be any single-byte or multi-byte encoding 68 // 69 //////////////////////////////////////////////////////////////////////// 70 71 //! 72 // UT_String, a simple wrapper for zero terminated 'char' strings. 73 // 74 class ABI_EXPORT UT_String 75 { 76 public: 77 UT_String(); 78 UT_String(const char* sz, size_t n = 0 /* 0 == zero-terminate */); 79 UT_String(const UT_String& rhs); 80 UT_String(const std::basic_string<char> &s); 81 ~UT_String(); 82 83 size_t size() const; length()84 size_t length () const { return size () ; } 85 void reserve(size_t n); 86 bool empty() const; 87 void clear() const; 88 89 UT_String substr(size_t iStart, size_t nChars) const; 90 91 UT_String& operator=(const UT_String& rhs); 92 UT_String& operator=(const char* rhs); 93 UT_String& operator=(const std::basic_string<char> & rhs); 94 UT_String& operator+=(const UT_String& rhs); 95 UT_String& operator+=(const char* rhs); 96 UT_String& operator+=(char rhs); 97 98 char operator[](size_t iPos) const; 99 char& operator[](size_t iPos); 100 101 void swap(UT_String& rhs); 102 103 // The returned pointer is valid until the next non-const 104 // operation. You will _always_ get a legal pointer back, 105 // even if to an empty string. 106 const char* c_str() const; 107 108 private: 109 class UT_StringImpl<char>* pimpl; 110 }; 111 112 // helpers 113 ABI_EXPORT bool operator==(const UT_String& s1, const UT_String& s2); 114 ABI_EXPORT bool operator==(const UT_String& s1, const char* s2); 115 ABI_EXPORT bool operator==(const char* s1, const UT_String& s2); 116 ABI_EXPORT bool operator!=(const UT_String& s1, const UT_String& s2); 117 ABI_EXPORT bool operator!=(const UT_String& s1, const char* s2); 118 ABI_EXPORT bool operator!=(const char* s1, const UT_String& s2); 119 120 ABI_EXPORT UT_uint32 hashcode(const UT_String& string); 121 ABI_EXPORT UT_uint32 hashcode(const char *s); 122 123 // strcmp ordering 124 ABI_EXPORT bool operator<(const UT_String& s1, const UT_String& s2); 125 126 ABI_EXPORT UT_String operator+(const UT_String& s1, const UT_String& s2); 127 128 ABI_EXPORT size_t UT_String_findCh(const UT_String &st, char ch); 129 ABI_EXPORT size_t UT_String_findRCh(const UT_String &st, char ch); 130 131 /****************************************************************************/ 132 133 /*! 134 * Fill \inStr with the results of evaulating the printf formatted string 135 * \inFormat and return the reference to \inStr 136 */ 137 ABI_EXPORT UT_String& UT_String_sprintf(UT_String & inStr, const char * inFormat, ...) ABI_PRINTF_FORMAT(2,3); 138 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const char *format, 139 va_list args1) 140 ABI_PRINTF_FORMAT(2,0); 141 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const UT_String & format, 142 va_list args1); 143 144 /*! 145 * Returns a new UT_String object with the results of evaluating the printf 146 * formatted string \inFormat 147 */ 148 ABI_EXPORT UT_String UT_String_sprintf(const char * inFormat, ...) 149 ABI_PRINTF_FORMAT(1,2); 150 ABI_EXPORT UT_String UT_String_vprintf(const char * inFormat, va_list args1) 151 ABI_PRINTF_FORMAT(1,0); 152 ABI_EXPORT UT_String UT_String_vprintf(const UT_String & inFormat, va_list args1); 153 154 /***************************************************************************/ 155 156 /***************************************************************************/ 157 /*! 158 * Some functions to add/subtract and extract UT_String properties from a UT_String of properties. 159 */ 160 161 ABI_EXPORT UT_String UT_String_getPropVal(const UT_String & sPropertyString, const UT_String & sProp); 162 ABI_EXPORT void UT_String_removeProperty(UT_String & sPropertyString, const UT_String & sProp); 163 ABI_EXPORT void UT_String_setProperty(UT_String & sPropertyString, const UT_String &sProp, const UT_String & sVal); 164 ABI_EXPORT void UT_String_addPropertyString(UT_String & sPropertyString, const UT_String & sNewProp); 165 166 //////////////////////////////////////////////////////////////////////// 167 // 168 // UTF-8 string: encoding is *always* UTF-8 169 // 170 //////////////////////////////////////////////////////////////////////// 171 172 //! 173 // UT_UTF8String, a simple wrapper for zero terminated 'UTF-8' strings. 174 // 175 176 class ABI_EXPORT UT_UTF8String 177 { 178 public: 179 UT_UTF8String (); 180 UT_UTF8String (const char * sz, size_t n = 0 /* 0 == null-termination */); 181 UT_UTF8String (const char *sz, const char *encoding); 182 183 UT_UTF8String (const UT_UTF8String & rhs); 184 UT_UTF8String (const UT_UCS4String & rhs); 185 UT_UTF8String (const UT_UCSChar * sz, size_t n = 0 /* 0 == zero-terminate */); 186 187 ~UT_UTF8String (); 188 189 size_t size () const; length()190 size_t length () const { return size () ; } 191 192 void reserve(size_t n); 193 bool empty () const; 194 void clear () const; 195 size_t byteLength() const; 196 void dump(void) const; 197 UT_UTF8String substr(size_t iStart, size_t nChars) const; 198 199 UT_UTF8String & operator=(const char * rhs); 200 UT_UTF8String & operator=(const std::string & rhs); 201 UT_UTF8String & operator=(const UT_UTF8String & rhs); 202 UT_UTF8String & operator=(const UT_UCS4String & rhs); 203 204 UT_UTF8String & operator+=(const UT_UCS4Char rhs); 205 UT_UTF8String & operator+=(const char * rhs); 206 UT_UTF8String & operator+=(const std::string & rhs); 207 UT_UTF8String & operator+=(const UT_UTF8String & rhs); 208 UT_UTF8String & operator+=(const UT_UCS4String & rhs); 209 210 // The returned pointer is valid until the next non-const 211 // operation. You will _always_ get a legal pointer back, 212 // even if to an empty (0) string. 213 const char * utf8_str () const; 214 UT_UCS4String ucs4_str (); 215 216 void assign (const char * sz, size_t n = 0 /* 0 == null-termination */); 217 void append (const char * sz, size_t n = 0 /* 0 == null-termination */); 218 void appendBuf (const UT_ByteBuf & buf, UT_UCS4_mbtowc & converter); 219 220 void appendUCS4 (const UT_UCS4Char * sz, size_t n = 0 /* 0 == null-termination */); 221 void appendUCS2 (const UT_UCS2Char * sz, size_t n = 0 /* 0 == null-termination */); 222 223 const UT_UTF8String & escape (const UT_UTF8String & str1, 224 const UT_UTF8String & str2); // replaces <str1> with <str2> in the current string 225 const UT_UTF8String & escapeXML (); // escapes '<', '>', '"', & '&' in the current string 226 const UT_UTF8String & decodeXML (); // unescapes '<', '>', '"', & '&' in the current string 227 const UT_UTF8String & escapeMIME (); // translates the current string to MIME "quoted-printable" format 228 const UT_UTF8String & lowerCase (); // forces current string to lowercase 229 const UT_UTF8String & escapeURL (); // make URL confirm to RFC 1738 230 const UT_UTF8String & decodeURL (); 231 232 /* UTF8String - NOTES 233 * 234 * TODO: 235 * 1. Maybe have a search&replace function, something like: 236 * 237 * int replace (const char * utf_newstr, const char * utf_oldstr); 238 * 239 * which could be used to do substitutions, e.g.: 240 * 241 * UTF8String xmlstr = "expr: if ((c > 0) && (c < 0x80)) return c;"; 242 * xmlstr.replace ("<", "<"); 243 * xmlstr.replace (">", ">"); 244 * xmlstr.replace ("&","&"); 245 * 246 * MIQ: Note that for these replace methods, one might use ut_std_string/replace_all() 247 * 248 * 249 * getIterator: 250 * returns a home-made iterator associated with the UTF-8 string, e.g.: 251 * 252 * UTF8String str = "This is a UTF-8 string."; 253 * UT_UTF8Stringbuf::UTF8Iterator & iter = str.getIterator (); 254 * iter = iter.start (); // iter.start() returns 0 if no string, so: 255 * if (iter.current ()) 256 * { 257 * while (true) 258 * { 259 * char * pUTF = iter.current (); 260 * if (*pUTF == 0) break; // end-of-string 261 * // etc. 262 * iter.advance (); // or ++iter; 263 * } 264 * } 265 * 266 * The iterator will be well behaved provided the string is not being edited. 267 */ getIterator()268 UT_UTF8Stringbuf::UTF8Iterator getIterator () const 269 { 270 return UT_UTF8Stringbuf::UTF8Iterator(pimpl); 271 } 272 273 private: 274 class UT_UTF8Stringbuf * pimpl; 275 }; 276 277 ABI_EXPORT bool operator<(const UT_UTF8String& s1, const UT_UTF8String& s2); 278 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const UT_UTF8String& s2); 279 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const UT_UTF8String& s2); 280 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const char * s2); 281 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const char * s2); 282 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const std::string & s2); 283 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const std::string & s2); 284 ABI_EXPORT bool operator==(const std::string & s2, const UT_UTF8String& s1); 285 ABI_EXPORT bool operator!=(const std::string & s2, const UT_UTF8String& s1); 286 ABI_EXPORT UT_UTF8String operator+(const UT_UTF8String & s1, const UT_UTF8String & s2); 287 ABI_EXPORT UT_UTF8String UT_UTF8String_sprintf(const char * inFormat, ...); 288 ABI_EXPORT UT_UTF8String & UT_UTF8String_sprintf(UT_UTF8String & inStr, const char * inFormat, ...); 289 290 291 /***************************************************************************/ 292 /*! 293 * Some functions to add/subtract and extract UT_String properties from a UT_String of properties. 294 */ 295 296 ABI_EXPORT UT_UTF8String UT_UTF8String_getPropVal(const UT_UTF8String & sPropertyString, const UT_UTF8String & sProp); 297 298 ABI_EXPORT void UT_UTF8String_removeProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp); 299 300 ABI_EXPORT void UT_UTF8String_setProperty(UT_UTF8String & sPropertyString, const UT_UTF8String &sProp, const UT_UTF8String & sVal); 301 302 ABI_EXPORT void UT_UTF8String_addPropertyString(UT_UTF8String & sPropertyString, const UT_UTF8String & sNewProp); 303 304 ABI_EXPORT void UT_UTF8String_replaceString(UT_UTF8String & sString, const UT_UTF8String & sOldValue,const UT_UTF8String & sNewValue ); 305 306 //////////////////////////////////////////////////////////////////////// 307 // 308 // UCS-4 string 309 // 310 // String is built of 32-bit units (longs) 311 // 312 // NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference 313 // NOTE: in the case of UCS-4 and UTF-32 since they really are 314 // NOTE: identical 315 // 316 //////////////////////////////////////////////////////////////////////// 317 318 //! 319 // UT_UCS4String, a simple wrapper for zero terminated 'UCS4' strings. 320 // 321 322 // TODO: add c_str(), encoded_str(const char * to) 323 324 class ABI_EXPORT UT_UCS4String 325 { 326 public: 327 UT_UCS4String(); 328 UT_UCS4String(const UT_UCS4Char * sz, size_t n = 0 /* 0 == zero-terminate */); 329 UT_UCS4String(const UT_UCS4String& rhs); 330 331 /* construct from a string in UTF-8 format 332 */ 333 UT_UCS4String(const char * utf8_str, size_t bytelength = 0 /* 0 == zero-terminate */); 334 UT_UCS4String(const std::string & str /* zero-terminated utf-8 encoded */); 335 336 /* construct from a string in UTF-8 format 337 * if (strip_whitespace == true) replace all white space sequences with a single UCS_SPACE 338 * if (strip_whitespace != true) replace CR-LF & CR by LF 339 * non-breaking spaces ( UCS_NBSP 0x0a) are not white space; see UT_UCS4_isspace() 340 */ 341 UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */, bool strip_whitespace); 342 343 ~UT_UCS4String(); 344 345 size_t size() const; length()346 size_t length () const { return size () ; } 347 348 void reserve(size_t n); 349 bool empty() const; 350 void clear() const; 351 352 UT_UCS4String substr(size_t iStart, size_t nChars) const; 353 UT_UCS4String substr(size_t iStart) const; 354 UT_UCS4String substr( const UT_UCS4Char* iter ) const; 355 356 UT_UCS4String& operator=(const UT_UCS4String& rhs); 357 UT_UCS4String& operator=(const UT_UCS4Char * rhs); 358 UT_UCS4String& operator+=(const UT_UCS4String& rhs); 359 UT_UCS4String& operator+=(const UT_UCS4Char * rhs); 360 UT_UCS4String& operator+=(UT_UCS4Char rhs); 361 UT_UCS4String& operator+=(char rhs); 362 UT_UCS4String& operator+=(unsigned char rhs); 363 364 UT_UCS4Char operator[](size_t iPos) const; 365 UT_UCS4Char& operator[](size_t iPos); 366 367 void swap(UT_UCS4String& rhs); 368 369 // The returned pointer is valid until the next non-const 370 // operation. You will _always_ get a legal pointer back, 371 // even if to an empty (0) string. 372 const UT_UCS4Char* ucs4_str() const; 373 374 // The same valid constraints as ucs4_str() applies to begin and end 375 const UT_UCS4Char* begin() const; 376 const UT_UCS4Char* end() const; 377 378 const char * utf8_str (); 379 380 private: 381 void _loadUtf8(const char * utf8_str, size_t bytelength); // implementation detail for the UTF-8 constructor 382 class UT_StringImpl<UT_UCS4Char>* pimpl; 383 }; 384 385 // helpers 386 bool operator==(const UT_UCS4String& s1, const UT_UCS4String& s2); 387 bool operator==(const UT_UCS4String& s1, const UT_UCS4Char * s2); 388 bool operator==(const UT_UCS4Char * s1, const UT_UCS4String& s2); 389 bool operator!=(const UT_UCS4String& s1, const UT_UCS4String& s2); 390 bool operator!=(const UT_UCS4String& s1, const UT_UCS4Char * s2); 391 bool operator!=(const UT_UCS4Char * s1, const UT_UCS4String& s2); 392 393 // strcmp ordering 394 bool operator<(const UT_UCS4String& s1, const UT_UCS4String& s2); 395 396 UT_UCS4String operator+(const UT_UCS4String& s1, const UT_UCS4String& s2); 397 398 399 400 #endif // UT_STRING_CLASS_H 401