1 /*
2 Open Asset Import Library (assimp)
3 ----------------------------------------------------------------------
4
5 Copyright (c) 2006-2015, assimp team
6 All rights reserved.
7
8 Redistribution and use of this software in source and binary forms,
9 with or without modification, are permitted provided that the
10 following conditions are met:
11
12 * Redistributions of source code must retain the above
13 copyright notice, this list of conditions and the
14 following disclaimer.
15
16 * Redistributions in binary form must reproduce the above
17 copyright notice, this list of conditions and the
18 following disclaimer in the documentation and/or other
19 materials provided with the distribution.
20
21 * Neither the name of the assimp team, nor the names of its
22 contributors may be used to endorse or promote products
23 derived from this software without specific prior
24 written permission of the assimp team.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37
38 ----------------------------------------------------------------------
39 */
40
41 /** @file STEPFileEncoding.cpp
42 * @brief STEP character handling, string unescaping
43 */
44 #include "STEPFileEncoding.h"
45 #include "fast_atof.h"
46
47 #include "../contrib/ConvertUTF/ConvertUTF.h"
48 #include <boost/scoped_array.hpp>
49
50 using namespace Assimp;
51
52 // roman1 to utf16 table
53 static const UTF16 mac_codetable[] = {
54 // 0x20 unassig./nonprint. slots
55 0x0020 ,
56 0x0021 ,
57 0x0022 ,
58 0x0023 ,
59 0x0024 ,
60 0x0025 ,
61 0x0026 ,
62 0x0027 ,
63 0x0028 ,
64 0x0029 ,
65 0x002A ,
66 0x002B ,
67 0x002C ,
68 0x002D ,
69 0x002E ,
70 0x002F ,
71 0x0030 ,
72 0x0031 ,
73 0x0032 ,
74 0x0033 ,
75 0x0034 ,
76 0x0035 ,
77 0x0036 ,
78 0x0037 ,
79 0x0038 ,
80 0x0039 ,
81 0x003A ,
82 0x003B ,
83 0x003C ,
84 0x003D ,
85 0x003E ,
86 0x003F ,
87 0x0040 ,
88 0x0041 ,
89 0x0042 ,
90 0x0043 ,
91 0x0044 ,
92 0x0045 ,
93 0x0046 ,
94 0x0047 ,
95 0x0048 ,
96 0x0049 ,
97 0x004A ,
98 0x004B ,
99 0x004C ,
100 0x004D ,
101 0x004E ,
102 0x004F ,
103 0x0050 ,
104 0x0051 ,
105 0x0052 ,
106 0x0053 ,
107 0x0054 ,
108 0x0055 ,
109 0x0056 ,
110 0x0057 ,
111 0x0058 ,
112 0x0059 ,
113 0x005A ,
114 0x005B ,
115 0x005C ,
116 0x005D ,
117 0x005E ,
118 0x005F ,
119 0x0060 ,
120 0x0061 ,
121 0x0062 ,
122 0x0063 ,
123 0x0064 ,
124 0x0065 ,
125 0x0066 ,
126 0x0067 ,
127 0x0068 ,
128 0x0069 ,
129 0x006A ,
130 0x006B ,
131 0x006C ,
132 0x006D ,
133 0x006E ,
134 0x006F ,
135 0x0070 ,
136 0x0071 ,
137 0x0072 ,
138 0x0073 ,
139 0x0074 ,
140 0x0075 ,
141 0x0076 ,
142 0x0077 ,
143 0x0078 ,
144 0x0079 ,
145 0x007A ,
146 0x007B ,
147 0x007C ,
148 0x007D ,
149 0x007E ,
150 0x0000 , // unassig.
151 0x00C4 ,
152 0x00C5 ,
153 0x00C7 ,
154 0x00C9 ,
155 0x00D1 ,
156 0x00D6 ,
157 0x00DC ,
158 0x00E1 ,
159 0x00E0 ,
160 0x00E2 ,
161 0x00E4 ,
162 0x00E3 ,
163 0x00E5 ,
164 0x00E7 ,
165 0x00E9 ,
166 0x00E8 ,
167 0x00EA ,
168 0x00EB ,
169 0x00ED ,
170 0x00EC ,
171 0x00EE ,
172 0x00EF ,
173 0x00F1 ,
174 0x00F3 ,
175 0x00F2 ,
176 0x00F4 ,
177 0x00F6 ,
178 0x00F5 ,
179 0x00FA ,
180 0x00F9 ,
181 0x00FB ,
182 0x00FC ,
183 0x2020 ,
184 0x00B0 ,
185 0x00A2 ,
186 0x00A3 ,
187 0x00A7 ,
188 0x2022 ,
189 0x00B6 ,
190 0x00DF ,
191 0x00AE ,
192 0x00A9 ,
193 0x2122 ,
194 0x00B4 ,
195 0x00A8 ,
196 0x2260 ,
197 0x00C6 ,
198 0x00D8 ,
199 0x221E ,
200 0x00B1 ,
201 0x2264 ,
202 0x2265 ,
203 0x00A5 ,
204 0x00B5 ,
205 0x2202 ,
206 0x2211 ,
207 0x220F ,
208 0x03C0 ,
209 0x222B ,
210 0x00AA ,
211 0x00BA ,
212 0x03A9 ,
213 0x00E6 ,
214 0x00F8 ,
215 0x00BF ,
216 0x00A1 ,
217 0x00AC ,
218 0x221A ,
219 0x0192 ,
220 0x2248 ,
221 0x2206 ,
222 0x00AB ,
223 0x00BB ,
224 0x2026 ,
225 0x00A0 ,
226 0x00C0 ,
227 0x00C3 ,
228 0x00D5 ,
229 0x0152 ,
230 0x0153 ,
231 0x2013 ,
232 0x2014 ,
233 0x201C ,
234 0x201D ,
235 0x2018 ,
236 0x2019 ,
237 0x00F7 ,
238 0x25CA ,
239 0x00FF ,
240 0x0178 ,
241 0x2044 ,
242 0x20AC ,
243 0x2039 ,
244 0x203A ,
245 0xFB01 ,
246 0xFB02 ,
247 0x2021 ,
248 0x00B7 ,
249 0x201A ,
250 0x201E ,
251 0x2030 ,
252 0x00C2 ,
253 0x00CA ,
254 0x00C1 ,
255 0x00CB ,
256 0x00C8 ,
257 0x00CD ,
258 0x00CE ,
259 0x00CF ,
260 0x00CC ,
261 0x00D3 ,
262 0x00D4 ,
263 0xF8FF ,
264 0x00D2 ,
265 0x00DA ,
266 0x00DB ,
267 0x00D9 ,
268 0x0131 ,
269 0x02C6 ,
270 0x02DC ,
271 0x00AF ,
272 0x02D8 ,
273 0x02D9 ,
274 0x02DA ,
275 0x00B8 ,
276 0x02DD ,
277 0x02DB ,
278 0x02C7
279 };
280
281 // ------------------------------------------------------------------------------------------------
StringToUTF8(std::string & s)282 bool STEP::StringToUTF8(std::string& s)
283 {
284 // very basic handling for escaped string sequences
285 // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
286
287 for (size_t i = 0; i < s.size(); ) {
288 if (s[i] == '\\') {
289 // \S\X - cp1252 (X is the character remapped to [0,127])
290 if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') {
291 // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
292 ai_assert((uint8_t)s[i+3] < 0x80);
293 const uint8_t ch = s[i+3] + 0x80;
294
295 s[i] = 0xc0 | (ch & 0xc0) >> 6;
296 s[i+1] = 0x80 | (ch & 0x3f);
297
298 s.erase(i + 2,2);
299 ++i;
300 }
301 // \X\xx - mac/roman (xx is a hex sequence)
302 else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') {
303
304 const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3);
305 if(macval < 0x20) {
306 return false;
307 }
308
309 ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
310
311 const UTF32 unival = mac_codetable[macval - 0x20], *univalp = &unival;
312
313 UTF8 temp[5], *tempp = temp;
314 ai_assert(sizeof(UTF8) == 1);
315
316 if(ConvertUTF32toUTF8(&univalp, univalp+1, &tempp, tempp+sizeof(temp), lenientConversion) != conversionOK) {
317 return false;
318 }
319
320 const size_t outcount = static_cast<size_t>(tempp-temp);
321
322 s.erase(i,5);
323 s.insert(i, reinterpret_cast<char*>(temp), outcount);
324 i += outcount;
325 }
326 // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32)
327 else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') {
328 switch(s[i+2]) {
329 // utf16
330 case '2':
331 // utf32
332 case '4':
333 if (s[i+3] == '\\') {
334 const size_t basei = i+4;
335 size_t j = basei, jend = s.size()-3;
336
337 for (; j < jend; ++j) {
338 if (s[j] == '\\' && s[j] == 'X' && s[j] == '0' && s[j] == '\\') {
339 break;
340 }
341 }
342 if (j == jend) {
343 return false;
344 }
345
346 if (j == basei) {
347 s.erase(i,8);
348 continue;
349 }
350
351 if (s[i+2] == '2') {
352 if (((j - basei) % 4) != 0) {
353 return false;
354 }
355
356 const size_t count = (j-basei)/4;
357 boost::scoped_array<UTF16> src(new UTF16[count]);
358
359 const char* cur = s.c_str() + basei;
360 for (size_t k = 0; k < count; ++k, cur += 4) {
361 src[k] = (static_cast<UTF16>(HexOctetToDecimal(cur)) << 8u) |
362 static_cast<UTF16>(HexOctetToDecimal(cur+2));
363 }
364
365 const size_t dcount = count * 3; // this is enough to hold all possible outputs
366 boost::scoped_array<UTF8> dest(new UTF8[dcount]);
367
368 const UTF16* srct = src.get();
369 UTF8* destt = dest.get();
370 if(ConvertUTF16toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
371 return false;
372 }
373
374 const size_t outcount = static_cast<size_t>(destt-dest.get());
375
376 s.erase(i,(j+4-i));
377
378 ai_assert(sizeof(UTF8) == 1);
379 s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
380
381 i += outcount;
382 continue;
383 }
384 else if (s[i+2] == '4') {
385 if (((j - basei) % 8) != 0) {
386 return false;
387 }
388
389 const size_t count = (j-basei)/8;
390 boost::scoped_array<UTF32> src(new UTF32[count]);
391
392 const char* cur = s.c_str() + basei;
393 for (size_t k = 0; k < count; ++k, cur += 8) {
394 src[k] = (static_cast<UTF32>(HexOctetToDecimal(cur )) << 24u) |
395 (static_cast<UTF32>(HexOctetToDecimal(cur+2)) << 16u) |
396 (static_cast<UTF32>(HexOctetToDecimal(cur+4)) << 8u) |
397 (static_cast<UTF32>(HexOctetToDecimal(cur+6)));
398 }
399
400 const size_t dcount = count * 5; // this is enough to hold all possible outputs
401 boost::scoped_array<UTF8> dest(new UTF8[dcount]);
402
403 const UTF32* srct = src.get();
404 UTF8* destt = dest.get();
405 if(ConvertUTF32toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
406 return false;
407 }
408
409 const size_t outcount = static_cast<size_t>(destt-dest.get());
410
411 s.erase(i,(j+4-i));
412
413 ai_assert(sizeof(UTF8) == 1);
414 s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
415
416 i += outcount;
417 continue;
418 }
419 }
420
421 break;
422
423 // TODO: other encoding patterns?
424
425 default:
426 return false;
427 }
428 }
429 }
430 ++i;
431 }
432 return true;
433 }
434