1 /* 2 * This program source code file is part of KiCad, a free EDA CAD application. 3 * 4 * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com> 5 * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors. 6 * 7 * @author Dick Hollenbeck 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; either version 2 12 * of the License, or (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program; if not, you may find one here: 21 * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html 22 * or you may search the http://www.gnu.org website for the version 2 license, 23 * or you may write to the Free Software Foundation, Inc., 24 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 25 */ 26 27 #ifndef UTF8_H_ 28 #define UTF8_H_ 29 30 #include <string> 31 #include <wx/string.h> 32 33 #if defined(DEBUG) 34 #define UTF8_VERIFY // Might someday be a hidden cmake config option 35 #endif 36 37 38 /** 39 * Test a C string to see if it is UTF8 encoded. 40 * 41 * An ASCII string is a valid UTF8 string. 42 */ 43 bool IsUTF8( const char* aString ); 44 45 46 #if defined(UTF8_VERIFY) 47 #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) ) 48 #else 49 #define MAYBE_VERIFY_UTF8(x) // nothing 50 #endif 51 52 53 /** 54 * An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion 55 * support to and from wxString, to and from std::string, and has non-mutating iteration 56 * over Unicode characters. 57 * 58 * I've been careful to supply only conversion facilities and not try and duplicate 59 * wxString() with many member functions. There are multiple ways to create text into 60 * a std::string without the need of too many member functions: 61 * 62 * - richio.h's StrPrintf(). 63 * - std::ostringstream. 64 * 65 * Because this class uses no virtuals, it should be possible to cast any std::string 66 * into a UTF8 using this kind of cast: (UTF8 &) without construction or copying being 67 * the effect of the cast. Be sure the source std::string holds UTF8 encoded text before 68 * you do that. 69 */ 70 class UTF8 71 { 72 public: 73 UTF8( const wxString& o ); 74 75 /// This is a constructor for which you could end up with 76 /// non-UTF8 encoding, but that would be your fault. UTF8(const char * txt)77 UTF8( const char* txt ) : 78 m_s( txt ) 79 { 80 MAYBE_VERIFY_UTF8( c_str() ); 81 } 82 83 /// For use with _() function on wx 2.8. 84 /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8. 85 UTF8( const wchar_t* txt ); 86 UTF8(const std::string & o)87 UTF8( const std::string& o ) : 88 m_s( o ) 89 { 90 MAYBE_VERIFY_UTF8( c_str() ); 91 } 92 UTF8()93 UTF8() 94 { 95 } 96 ~UTF8()97 ~UTF8() // Needed mainly to build python wrapper 98 { 99 } 100 101 // expose some std::string functions publicly, since base class must be private. c_str()102 const char* c_str() const { return m_s.c_str(); } empty()103 bool empty() const { return m_s.empty(); } 104 find(char c)105 std::string::size_type find( char c ) const { return m_s.find( c ); } find(char c,size_t s)106 std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); } 107 clear()108 void clear() { m_s.clear(); } length()109 std::string::size_type length() const { return m_s.length(); } size()110 std::string::size_type size() const { return m_s.size(); } compare(const std::string & s)111 int compare( const std::string& s ) const { return m_s.compare( s ); } 112 113 bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; } 114 bool operator==( const std::string& rhs ) const { return m_s == rhs; } 115 bool operator==( const char* s ) const { return m_s == s; } 116 117 std::string::size_type find_first_of( const std::string& str, 118 std::string::size_type pos = 0 ) const 119 { 120 return m_s.find_first_of( str, pos ); 121 } 122 123 UTF8& operator+=( const UTF8& str ) 124 { 125 m_s += str.m_s; 126 MAYBE_VERIFY_UTF8( c_str() ); 127 return *this; 128 } 129 130 UTF8& operator+=( char ch ) 131 { 132 m_s.operator+=( ch ); 133 MAYBE_VERIFY_UTF8( c_str() ); 134 return *this; 135 } 136 137 UTF8& operator+=( const char* s ) 138 { 139 m_s.operator+=( s ); 140 MAYBE_VERIFY_UTF8( c_str() ); 141 return *this; 142 } 143 144 /// Append a wide (unicode) char to the UTF8 string. 145 /// if this wide char is not a ASCII7 char, it will be added as a UTF8 multibyte sequence 146 /// @param w_ch is a UTF-16 value (can be a UTF-32 on Linux) 147 UTF8& operator+=( unsigned w_ch ); 148 149 // std::string::npos is not constexpr, so we can't use it in an 150 // initializer. 151 static constexpr std::string::size_type npos = -1; 152 153 UTF8& operator=( const wxString& o ); 154 155 UTF8& operator=( const std::string& o ) 156 { 157 m_s = o; 158 MAYBE_VERIFY_UTF8( c_str() ); 159 return *this; 160 } 161 162 UTF8& operator=( const char* s ) 163 { 164 m_s = s; 165 MAYBE_VERIFY_UTF8( c_str() ); 166 return *this; 167 } 168 169 UTF8& operator=( char c ) 170 { 171 m_s = c; 172 MAYBE_VERIFY_UTF8( c_str() ); 173 return *this; 174 } 175 176 // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character 177 // was split, so return std::string not UTF8 178 std::string substr( size_t pos = 0, size_t len = npos ) const 179 { 180 return m_s.substr( pos, len ); 181 } 182 183 operator const std::string& () const { return m_s; } 184 //operator std::string& () { return m_s; } 185 //operator std::string () const { return m_s; } 186 187 wxString wx_str() const; 188 operator wxString () const; 189 190 // "Read only" iterating over bytes is done with these, use the uni_iter to iterate 191 // over UTF8 (multi-byte) characters begin()192 std::string::const_iterator begin() const { return m_s.begin(); } end()193 std::string::const_iterator end() const { return m_s.end(); } 194 195 #ifndef SWIG 196 /** 197 * uni_iter 198 * is a non-mutating iterator that walks through unicode code points in the UTF8 encoded 199 * string. The normal ++(), ++(int), ->(), and *() operators are all supported 200 * for read only access and some return an unsigned holding the unicode character 201 * appropriate for the respective operator. 202 */ 203 class uni_iter 204 { 205 public: uni_iter()206 uni_iter() // Needed only to build python wrapper, not used outside the wrapper 207 { 208 it = nullptr; 209 } 210 uni_iter(const uni_iter & o)211 uni_iter( const uni_iter& o ) 212 { 213 it = o.it; 214 } 215 216 /// pre-increment and return uni_iter at new position 217 const uni_iter& operator++() 218 { 219 it += uni_forward( it ); 220 return *this; 221 } 222 223 /// post-increment and return uni_iter at initial position 224 uni_iter operator++( int ) 225 { 226 uni_iter ret = *this; 227 228 it += uni_forward( it ); 229 return ret; 230 } 231 232 /// return unicode at current position 233 unsigned operator->() const 234 { 235 unsigned result; 236 237 // grab the result, do not advance 238 uni_forward( it, &result ); 239 return result; 240 } 241 242 /// return unicode at current position 243 unsigned operator*() const 244 { 245 unsigned result; 246 247 // grab the result, do not advance 248 uni_forward( it, &result ); 249 return result; 250 } 251 252 uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); } 253 254 bool operator==( const uni_iter& other ) const { return it == other.it; } 255 bool operator!=( const uni_iter& other ) const { return it != other.it; } 256 257 /// Since the ++ operators advance more than one byte, this is your best 258 /// loop termination test, < end(), not == end(). 259 bool operator< ( const uni_iter& other ) const { return it < other.it; } 260 bool operator<=( const uni_iter& other ) const { return it <= other.it; } 261 bool operator> ( const uni_iter& other ) const { return it > other.it; } 262 bool operator>=( const uni_iter& other ) const { return it >= other.it; } 263 264 private: 265 friend class UTF8; 266 267 const unsigned char* it; 268 269 // private constructor uni_iter(const char * start)270 uni_iter( const char* start ) : 271 it( (const unsigned char*) start ) 272 { 273 } 274 }; 275 276 /** 277 * Returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence. 278 */ ubegin()279 uni_iter ubegin() const 280 { 281 return uni_iter( m_s.data() ); 282 } 283 284 /** 285 * Return a @a uni_iter initialized to the end of "this" UTF8 byte sequence. 286 */ uend()287 uni_iter uend() const 288 { 289 return uni_iter( m_s.data() + m_s.size() ); 290 } 291 292 /** 293 * Advance over a single UTF8 encoded multibyte character, capturing the Unicode character 294 * as it goes, and returning the number of bytes consumed. 295 * 296 * @param aSequence is the UTF8 byte sequence, must be aligned on start of character. 297 * @param aResult is where to put the unicode character, and may be NULL if no interest. 298 * @return the count of bytes consumed. 299 */ 300 static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr ); 301 #endif // SWIG 302 303 protected: 304 std::string m_s; 305 }; 306 307 308 #endif // UTF8_H_ 309