1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013-2017 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, you may find one here:
19  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20  * or you may search the http://www.gnu.org website for the version 2 license,
21  * or you may write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
23  */
24 
25 #include <utf8.h>
26 #include <ki_exception.h>
27 #include <wx/strconv.h>
28 #include <wx/buffer.h>
29 #include <vector>
30 
31 #include <cassert>
32 
33 
34 /*
35     These are not inlined so that code space is saved by encapsulating the
36     creation of intermediate objects and the referencing of wxConvUTF8.
37 */
38 
39 
UTF8(const wxString & o)40 UTF8::UTF8( const wxString& o ) :
41     m_s( (const char*) o.utf8_str() )
42 {
43 }
44 
45 
wx_str() const46 wxString UTF8::wx_str() const
47 {
48     return wxString( c_str(), wxConvUTF8 );
49 }
50 
51 
operator wxString() const52 UTF8::operator wxString () const
53 {
54     return wxString( c_str(), wxConvUTF8 );
55 }
56 
57 
operator =(const wxString & o)58 UTF8& UTF8::operator=( const wxString& o )
59 {
60     m_s = (const char*) o.utf8_str();
61     return *this;
62 }
63 
64 
// There is no wxWidgets function that does this, because wchar_t is 16 bits
// on Windows and wx wants to encode the output in UTF-16 there.
67 
uni_forward(const unsigned char * aSequence,unsigned * aResult)68 int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
69 {
70     unsigned ch = *aSequence;
71 
72     if( ch < 0x80 )
73     {
74         if( aResult )
75             *aResult = ch;
76         return 1;
77     }
78 
79     const unsigned char* s = aSequence;
80 
81     static const unsigned char utf8_len[] = {
82         // Map encoded prefix byte to sequence length.  Zero means
83         // illegal prefix.  See RFC 3629 for details
84         /*
85         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
86         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
91         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
92         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
93         */
94         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
95         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
96         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
98         0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
99         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
100         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
101         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F0-F4 + F5-FF
102     };
103 
104     int len = utf8_len[ *s - 0x80  /* top half of table is missing */ ];
105 
106     switch( len )
107     {
108     default:
109     case 0:
110         if( aResult )
111             wxFAIL_MSG( "uni_forward: invalid start byte" );
112 
113         return 0;
114         break;
115 
116     case 2:
117         if( ( s[1] & 0xc0 ) != 0x80 )
118         {
119             if( aResult )
120                 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
121 
122             return 0;
123         }
124 
125         ch =    ((s[0] & 0x1f) << 6) +
126                 ((s[1] & 0x3f) << 0);
127 
128         // assert( ch > 0x007F && ch <= 0x07FF );
129         break;
130 
131     case 3:
132         if( (s[1] & 0xc0) != 0x80 ||
133             (s[2] & 0xc0) != 0x80 ||
134             (s[0] == 0xE0 && s[1] < 0xA0)
135             // || (s[0] == 0xED && s[1] > 0x9F)
136         )
137         {
138             if( aResult )
139                 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
140 
141             return 0;
142         }
143 
144         ch =    ((s[0] & 0x0f) << 12) +
145                 ((s[1] & 0x3f) << 6 ) +
146                 ((s[2] & 0x3f) << 0 );
147 
148         // assert( ch > 0x07FF && ch <= 0xFFFF );
149         break;
150 
151     case 4:
152         if( (s[1] & 0xc0) != 0x80 ||
153             (s[2] & 0xc0) != 0x80 ||
154             (s[3] & 0xc0) != 0x80 ||
155             (s[0] == 0xF0 && s[1] < 0x90) ||
156             (s[0] == 0xF4 && s[1] > 0x8F) )
157         {
158             if( aResult )
159                 wxFAIL_MSG( "uni_forward: invalid continuation byte" );
160 
161             return 0;
162         }
163 
164         ch =    ((s[0] & 0x7)  << 18) +
165                 ((s[1] & 0x3f) << 12) +
166                 ((s[2] & 0x3f) << 6 ) +
167                 ((s[3] & 0x3f) << 0 );
168 
169         // assert( ch > 0xFFFF && ch <= 0x10ffff );
170         break;
171     }
172 
173     if( aResult )
174         *aResult = ch;
175 
176     return len;
177 }
178 
179 
IsUTF8(const char * aString)180 bool IsUTF8( const char* aString )
181 {
182     int len = strlen( aString );
183 
184     if( len )
185     {
186         const unsigned char* next = (unsigned char*) aString;
187         const unsigned char* end  = next + len;
188 
189         while( next < end )
190         {
191             int charLen = UTF8::uni_forward( next, nullptr );
192 
193             if( charLen == 0 )
194                 return false;
195 
196             next += charLen;
197         }
198 
199         // uni_forward() should find the exact end if it is truly UTF8
200         if( next > end )
201             return false;
202     }
203 
204     return true;
205 }
206 
207 
UTF8(const wchar_t * txt)208 UTF8::UTF8( const wchar_t* txt )
209 {
210     try
211     {
212         std::vector< char > temp( wcslen( txt ) * 4 + 1 );
213         wxConvUTF8.WC2MB( temp.data(), txt, temp.size() );
214         m_s.assign( temp.data() );
215     }
216     catch(...)
217     {
218         auto string = wxSafeConvertWX2MB( txt );
219         m_s.assign( string );
220     }
221 
222     m_s.shrink_to_fit();
223 }
224 
225 
operator +=(unsigned w_ch)226 UTF8& UTF8::operator+=( unsigned w_ch )
227 {
228     if( w_ch <= 0x7F )
229     {
230         m_s.operator+=( char( w_ch ) );
231     }
232     else
233     {
234         //TODO: Remove wchar use.  Replace with std::byte*
235         wchar_t wide_chr[2];    // buffer to store wide chars (UTF16) read from aText
236         wide_chr[1] = 0;
237         wide_chr[0] = w_ch;
238         UTF8 substr( wide_chr );
239         m_s += substr.m_s;
240     }
241 
242     return *this;
243 }
244