1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6  *
7  * @author Dick Hollenbeck
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License
11  * as published by the Free Software Foundation; either version 2
12  * of the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, you may find one here:
21  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
22  * or you may search the http://www.gnu.org website for the version 2 license,
23  * or you may write to the Free Software Foundation, Inc.,
24  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
25  */
26 
27 #ifndef UTF8_H_
28 #define UTF8_H_
29 
30 #include <string>
31 #include <wx/string.h>
32 
// Debug builds turn on UTF8 verification.
// Might someday be a hidden cmake config option.
#ifdef DEBUG
 #define UTF8_VERIFY
#endif
36 
37 
38 /**
39  * Test a C string to see if it is UTF8 encoded.
40  *
41  * An ASCII string is a valid UTF8 string.
42  */
43 bool IsUTF8( const char* aString );
44 
45 
// MAYBE_VERIFY_UTF8( x ) expands to nothing unless UTF8_VERIFY is defined, in which
// case it asserts (via wxASSERT) that IsUTF8( x ) holds.
#ifndef UTF8_VERIFY
 #define MAYBE_VERIFY_UTF8( x )
#else
 #define MAYBE_VERIFY_UTF8( x )     wxASSERT( IsUTF8( x ) )
#endif
51 
52 
53 /**
54  * An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion
55  * support to and from wxString, to and from std::string, and has non-mutating iteration
56  * over Unicode characters.
57  *
58  * I've been careful to supply only conversion facilities and not try and duplicate
59  * wxString() with many member functions. There are multiple ways to create text into
60  * a std::string without the need of too many member functions:
61  *
62  *  - richio.h's StrPrintf().
63  *  - std::ostringstream.
64  *
65  * Because this class uses no virtuals, it should be possible to cast any std::string
66  * into a UTF8 using this kind of cast: (UTF8 &) without construction or copying being
67  * the effect of the cast.  Be sure the source std::string holds UTF8 encoded text before
68  * you do that.
69  */
70 class UTF8
71 {
72 public:
73     UTF8( const wxString& o );
74 
75     /// This is a constructor for which you could end up with
76     /// non-UTF8 encoding, but that would be your fault.
UTF8(const char * txt)77     UTF8( const char* txt ) :
78         m_s( txt )
79     {
80         MAYBE_VERIFY_UTF8( c_str() );
81     }
82 
83     /// For use with _() function on wx 2.8.
84     /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
85     UTF8( const wchar_t* txt );
86 
UTF8(const std::string & o)87     UTF8( const std::string& o ) :
88         m_s( o )
89     {
90         MAYBE_VERIFY_UTF8( c_str() );
91     }
92 
UTF8()93     UTF8()
94     {
95     }
96 
~UTF8()97     ~UTF8()     // Needed mainly to build python wrapper
98     {
99     }
100 
101     // expose some std::string functions publicly, since base class must be private.
c_str()102     const char* c_str()                         const   { return m_s.c_str(); }
empty()103     bool empty()                                const   { return m_s.empty(); }
104 
find(char c)105     std::string::size_type find( char c )       const   { return m_s.find( c ); }
find(char c,size_t s)106     std::string::size_type find( char c, size_t s )     const   { return m_s.find( c, s ); }
107 
clear()108     void clear()                                        { m_s.clear(); }
length()109     std::string::size_type length()             const   { return m_s.length(); }
size()110     std::string::size_type size()               const   { return m_s.size(); }
compare(const std::string & s)111     int compare( const std::string& s )         const   { return m_s.compare( s ); }
112 
113     bool operator==( const UTF8& rhs )          const   { return m_s == rhs.m_s; }
114     bool operator==( const std::string& rhs )   const   { return m_s == rhs; }
115     bool operator==( const char* s )            const   { return m_s == s; }
116 
117     std::string::size_type find_first_of( const std::string& str,
118                                           std::string::size_type pos = 0 ) const
119     {
120         return m_s.find_first_of( str, pos );
121     }
122 
123     UTF8& operator+=( const UTF8& str )
124     {
125         m_s += str.m_s;
126         MAYBE_VERIFY_UTF8( c_str() );
127         return *this;
128     }
129 
130     UTF8& operator+=( char ch )
131     {
132         m_s.operator+=( ch );
133         MAYBE_VERIFY_UTF8( c_str() );
134         return *this;
135     }
136 
137     UTF8& operator+=( const char* s )
138     {
139         m_s.operator+=( s );
140         MAYBE_VERIFY_UTF8( c_str() );
141         return *this;
142     }
143 
144     /// Append a wide (unicode) char to the UTF8 string.
145     /// if this wide char is not a ASCII7 char, it will be added as a UTF8 multibyte sequence
146     /// @param w_ch is a UTF-16 value (can be a UTF-32 on Linux)
147     UTF8& operator+=( unsigned w_ch );
148 
149     // std::string::npos is not constexpr, so we can't use it in an
150     // initializer.
151     static constexpr std::string::size_type npos = -1;
152 
153     UTF8& operator=( const wxString& o );
154 
155     UTF8& operator=( const std::string& o )
156     {
157         m_s = o;
158         MAYBE_VERIFY_UTF8( c_str() );
159         return *this;
160     }
161 
162     UTF8& operator=( const char* s )
163     {
164         m_s = s;
165         MAYBE_VERIFY_UTF8( c_str() );
166         return *this;
167     }
168 
169     UTF8& operator=( char c )
170     {
171         m_s = c;
172         MAYBE_VERIFY_UTF8( c_str() );
173         return *this;
174     }
175 
176     // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
177     // was split, so return std::string not UTF8
178     std::string substr( size_t pos = 0, size_t len = npos ) const
179     {
180         return m_s.substr( pos, len );
181     }
182 
183     operator const std::string& () const    { return m_s; }
184     //operator std::string& ()                { return m_s; }
185     //operator std::string () const           { return m_s; }
186 
187     wxString wx_str() const;
188     operator wxString () const;
189 
190     // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
191     // over UTF8 (multi-byte) characters
begin()192     std::string::const_iterator begin()         const   { return m_s.begin(); }
end()193     std::string::const_iterator end()           const   { return m_s.end(); }
194 
195 #ifndef SWIG
196     /**
197      * uni_iter
198      * is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
199      * string.  The normal ++(), ++(int), ->(), and *() operators are all supported
200      * for read only access and some return an unsigned holding the unicode character
201      * appropriate for the respective operator.
202      */
203     class uni_iter
204     {
205     public:
uni_iter()206         uni_iter()  // Needed only to build python wrapper, not used outside the wrapper
207         {
208             it = nullptr;
209         }
210 
uni_iter(const uni_iter & o)211         uni_iter( const uni_iter& o )
212         {
213             it = o.it;
214         }
215 
216         /// pre-increment and return uni_iter at new position
217         const uni_iter& operator++()
218         {
219             it += uni_forward( it );
220             return *this;
221         }
222 
223         /// post-increment and return uni_iter at initial position
224         uni_iter operator++( int )
225         {
226             uni_iter ret = *this;
227 
228             it += uni_forward( it );
229             return ret;
230         }
231 
232         /// return unicode at current position
233         unsigned operator->() const
234         {
235             unsigned    result;
236 
237             // grab the result, do not advance
238             uni_forward( it, &result );
239             return result;
240         }
241 
242         /// return unicode at current position
243         unsigned operator*() const
244         {
245             unsigned    result;
246 
247             // grab the result, do not advance
248             uni_forward( it, &result );
249             return result;
250         }
251 
252         uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
253 
254         bool operator==( const uni_iter& other ) const  { return it == other.it; }
255         bool operator!=( const uni_iter& other ) const  { return it != other.it; }
256 
257         /// Since the ++ operators advance more than one byte, this is your best
258         /// loop termination test, < end(), not == end().
259         bool operator< ( const uni_iter& other ) const  { return it <  other.it; }
260         bool operator<=( const uni_iter& other ) const  { return it <= other.it; }
261         bool operator> ( const uni_iter& other ) const  { return it >  other.it; }
262         bool operator>=( const uni_iter& other ) const  { return it >= other.it; }
263 
264     private:
265         friend class UTF8;
266 
267         const unsigned char* it;
268 
269         // private constructor
uni_iter(const char * start)270         uni_iter( const char* start ) :
271             it( (const unsigned char*) start )
272         {
273         }
274     };
275 
276     /**
277      * Returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
278      */
ubegin()279     uni_iter ubegin() const
280     {
281         return uni_iter( m_s.data() );
282     }
283 
284     /**
285      * Return a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
286      */
uend()287     uni_iter uend() const
288     {
289         return uni_iter( m_s.data() + m_s.size() );
290     }
291 
292     /**
293      * Advance over a single UTF8 encoded multibyte character, capturing the Unicode character
294      * as it goes, and returning the number of bytes consumed.
295      *
296      * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
297      * @param aResult is where to put the unicode character, and may be NULL if no interest.
298      * @return the count of bytes consumed.
299      */
300     static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
301 #endif  // SWIG
302 
303 protected:
304     std::string m_s;
305 };
306 
307 
308 #endif // UTF8_H_
309