1 /*
2 Open Asset Import Library (assimp)
3 ----------------------------------------------------------------------
4 
5 Copyright (c) 2006-2015, assimp team
6 All rights reserved.
7 
8 Redistribution and use of this software in source and binary forms,
9 with or without modification, are permitted provided that the
10 following conditions are met:
11 
12 * Redistributions of source code must retain the above
13   copyright notice, this list of conditions and the
14   following disclaimer.
15 
16 * Redistributions in binary form must reproduce the above
17   copyright notice, this list of conditions and the
18   following disclaimer in the documentation and/or other
19   materials provided with the distribution.
20 
21 * Neither the name of the assimp team, nor the names of its
22   contributors may be used to endorse or promote products
23   derived from this software without specific prior
24   written permission of the assimp team.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
27 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
28 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
29 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
30 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
31 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
32 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
36 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 
38 ----------------------------------------------------------------------
39 */
40 
41 /** @file  STEPFileEncoding.cpp
42  *  @brief STEP character handling, string unescaping
43  */
44 #include "STEPFileEncoding.h"
45 #include "fast_atof.h"
46 
47 #include "../contrib/ConvertUTF/ConvertUTF.h"
48 #include <boost/scoped_array.hpp>
49 
50 using namespace Assimp;
51 
52 // roman1 to utf16 table
53 static const UTF16 mac_codetable[] = {
54     // 0x20 unassig./nonprint. slots
55      0x0020 ,
56      0x0021 ,
57      0x0022 ,
58      0x0023 ,
59      0x0024 ,
60      0x0025 ,
61      0x0026 ,
62      0x0027 ,
63      0x0028 ,
64      0x0029 ,
65      0x002A ,
66      0x002B ,
67      0x002C ,
68      0x002D ,
69      0x002E ,
70      0x002F ,
71      0x0030 ,
72      0x0031 ,
73      0x0032 ,
74      0x0033 ,
75      0x0034 ,
76      0x0035 ,
77      0x0036 ,
78      0x0037 ,
79      0x0038 ,
80      0x0039 ,
81      0x003A ,
82      0x003B ,
83      0x003C ,
84      0x003D ,
85      0x003E ,
86      0x003F ,
87      0x0040 ,
88      0x0041 ,
89      0x0042 ,
90      0x0043 ,
91      0x0044 ,
92      0x0045 ,
93      0x0046 ,
94      0x0047 ,
95      0x0048 ,
96      0x0049 ,
97      0x004A ,
98      0x004B ,
99      0x004C ,
100      0x004D ,
101      0x004E ,
102      0x004F ,
103      0x0050 ,
104      0x0051 ,
105      0x0052 ,
106      0x0053 ,
107      0x0054 ,
108      0x0055 ,
109      0x0056 ,
110      0x0057 ,
111      0x0058 ,
112      0x0059 ,
113      0x005A ,
114      0x005B ,
115      0x005C ,
116      0x005D ,
117      0x005E ,
118      0x005F ,
119      0x0060 ,
120      0x0061 ,
121      0x0062 ,
122      0x0063 ,
123      0x0064 ,
124      0x0065 ,
125      0x0066 ,
126      0x0067 ,
127      0x0068 ,
128      0x0069 ,
129      0x006A ,
130      0x006B ,
131      0x006C ,
132      0x006D ,
133      0x006E ,
134      0x006F ,
135      0x0070 ,
136      0x0071 ,
137      0x0072 ,
138      0x0073 ,
139      0x0074 ,
140      0x0075 ,
141      0x0076 ,
142      0x0077 ,
143      0x0078 ,
144      0x0079 ,
145      0x007A ,
146      0x007B ,
147      0x007C ,
148      0x007D ,
149      0x007E ,
150      0x0000 , // unassig.
151      0x00C4 ,
152      0x00C5 ,
153      0x00C7 ,
154      0x00C9 ,
155      0x00D1 ,
156      0x00D6 ,
157      0x00DC ,
158      0x00E1 ,
159      0x00E0 ,
160      0x00E2 ,
161      0x00E4 ,
162      0x00E3 ,
163      0x00E5 ,
164      0x00E7 ,
165      0x00E9 ,
166      0x00E8 ,
167      0x00EA ,
168      0x00EB ,
169      0x00ED ,
170      0x00EC ,
171      0x00EE ,
172      0x00EF ,
173      0x00F1 ,
174      0x00F3 ,
175      0x00F2 ,
176      0x00F4 ,
177      0x00F6 ,
178      0x00F5 ,
179      0x00FA ,
180      0x00F9 ,
181      0x00FB ,
182      0x00FC ,
183      0x2020 ,
184      0x00B0 ,
185      0x00A2 ,
186      0x00A3 ,
187      0x00A7 ,
188      0x2022 ,
189      0x00B6 ,
190      0x00DF ,
191      0x00AE ,
192      0x00A9 ,
193      0x2122 ,
194      0x00B4 ,
195      0x00A8 ,
196      0x2260 ,
197      0x00C6 ,
198      0x00D8 ,
199      0x221E ,
200      0x00B1 ,
201      0x2264 ,
202      0x2265 ,
203      0x00A5 ,
204      0x00B5 ,
205      0x2202 ,
206      0x2211 ,
207      0x220F ,
208      0x03C0 ,
209      0x222B ,
210      0x00AA ,
211      0x00BA ,
212      0x03A9 ,
213      0x00E6 ,
214      0x00F8 ,
215      0x00BF ,
216      0x00A1 ,
217      0x00AC ,
218      0x221A ,
219      0x0192 ,
220      0x2248 ,
221      0x2206 ,
222      0x00AB ,
223      0x00BB ,
224      0x2026 ,
225      0x00A0 ,
226      0x00C0 ,
227      0x00C3 ,
228      0x00D5 ,
229      0x0152 ,
230      0x0153 ,
231      0x2013 ,
232      0x2014 ,
233      0x201C ,
234      0x201D ,
235      0x2018 ,
236      0x2019 ,
237      0x00F7 ,
238      0x25CA ,
239      0x00FF ,
240      0x0178 ,
241      0x2044 ,
242      0x20AC ,
243      0x2039 ,
244      0x203A ,
245      0xFB01 ,
246      0xFB02 ,
247      0x2021 ,
248      0x00B7 ,
249      0x201A ,
250      0x201E ,
251      0x2030 ,
252      0x00C2 ,
253      0x00CA ,
254      0x00C1 ,
255      0x00CB ,
256      0x00C8 ,
257      0x00CD ,
258      0x00CE ,
259      0x00CF ,
260      0x00CC ,
261      0x00D3 ,
262      0x00D4 ,
263      0xF8FF ,
264      0x00D2 ,
265      0x00DA ,
266      0x00DB ,
267      0x00D9 ,
268      0x0131 ,
269      0x02C6 ,
270      0x02DC ,
271      0x00AF ,
272      0x02D8 ,
273      0x02D9 ,
274      0x02DA ,
275      0x00B8 ,
276      0x02DD ,
277      0x02DB ,
278      0x02C7
279 };
280 
281 // ------------------------------------------------------------------------------------------------
StringToUTF8(std::string & s)282 bool STEP::StringToUTF8(std::string& s)
283 {
284     // very basic handling for escaped string sequences
285     // http://doc.spatial.com/index.php?title=InterOp:Connect/STEP&redirect=no
286 
287     for (size_t i = 0; i < s.size(); ) {
288         if (s[i] == '\\') {
289             // \S\X - cp1252 (X is the character remapped to [0,127])
290             if (i+3 < s.size() && s[i+1] == 'S' && s[i+2] == '\\') {
291                 // http://stackoverflow.com/questions/5586214/how-to-convert-char-from-iso-8859-1-to-utf-8-in-c-multiplatformly
292                 ai_assert((uint8_t)s[i+3] < 0x80);
293                 const uint8_t ch = s[i+3] + 0x80;
294 
295                 s[i] = 0xc0 | (ch & 0xc0) >> 6;
296                 s[i+1] =  0x80 | (ch & 0x3f);
297 
298                 s.erase(i + 2,2);
299                 ++i;
300             }
301             // \X\xx - mac/roman (xx is a hex sequence)
302             else if (i+4 < s.size() && s[i+1] == 'X' && s[i+2] == '\\') {
303 
304                 const uint8_t macval = HexOctetToDecimal(s.c_str() + i + 3);
305                 if(macval < 0x20) {
306                     return false;
307                 }
308 
309                 ai_assert(sizeof(mac_codetable) / sizeof(mac_codetable[0]) == 0x100-0x20);
310 
311                 const UTF32 unival = mac_codetable[macval - 0x20], *univalp = &unival;
312 
313                 UTF8 temp[5], *tempp = temp;
314                 ai_assert(sizeof(UTF8) == 1);
315 
316                 if(ConvertUTF32toUTF8(&univalp, univalp+1, &tempp, tempp+sizeof(temp), lenientConversion) != conversionOK) {
317                     return false;
318                 }
319 
320                 const size_t outcount = static_cast<size_t>(tempp-temp);
321 
322                 s.erase(i,5);
323                 s.insert(i, reinterpret_cast<char*>(temp), outcount);
324                 i += outcount;
325             }
326             // \Xn\ .. \X0\ - various unicode encodings (n=2: utf16; n=4: utf32)
327             else if (i+3 < s.size() && s[i+1] == 'X' && s[i+2] >= '0' && s[i+2] <= '9') {
328                 switch(s[i+2]) {
329                     // utf16
330                 case '2':
331                     // utf32
332                 case '4':
333                     if (s[i+3] == '\\') {
334                         const size_t basei = i+4;
335                         size_t j = basei, jend = s.size()-3;
336 
337                         for (; j < jend; ++j) {
338                             if (s[j] == '\\' && s[j] == 'X' && s[j] == '0' && s[j] == '\\') {
339                                 break;
340                             }
341                         }
342                         if (j == jend) {
343                             return false;
344                         }
345 
346                         if (j == basei) {
347                             s.erase(i,8);
348                             continue;
349                         }
350 
351                         if (s[i+2] == '2') {
352                             if (((j - basei) % 4) != 0) {
353                                 return false;
354                             }
355 
356                             const size_t count = (j-basei)/4;
357                             boost::scoped_array<UTF16> src(new UTF16[count]);
358 
359                             const char* cur = s.c_str() + basei;
360                             for (size_t k = 0; k < count; ++k, cur += 4) {
361                                 src[k] = (static_cast<UTF16>(HexOctetToDecimal(cur)) << 8u)  |
362                                      static_cast<UTF16>(HexOctetToDecimal(cur+2));
363                             }
364 
365                             const size_t dcount = count * 3; // this is enough to hold all possible outputs
366                             boost::scoped_array<UTF8> dest(new UTF8[dcount]);
367 
368                             const UTF16* srct = src.get();
369                             UTF8* destt = dest.get();
370                             if(ConvertUTF16toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
371                                 return false;
372                             }
373 
374                             const size_t outcount = static_cast<size_t>(destt-dest.get());
375 
376                             s.erase(i,(j+4-i));
377 
378                             ai_assert(sizeof(UTF8) == 1);
379                             s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
380 
381                             i += outcount;
382                             continue;
383                         }
384                         else if (s[i+2] == '4') {
385                             if (((j - basei) % 8) != 0) {
386                                 return false;
387                             }
388 
389                             const size_t count = (j-basei)/8;
390                             boost::scoped_array<UTF32> src(new UTF32[count]);
391 
392                             const char* cur = s.c_str() + basei;
393                             for (size_t k = 0; k < count; ++k, cur += 8) {
394                                 src[k] = (static_cast<UTF32>(HexOctetToDecimal(cur  )) << 24u) |
395                                          (static_cast<UTF32>(HexOctetToDecimal(cur+2)) << 16u) |
396                                          (static_cast<UTF32>(HexOctetToDecimal(cur+4)) << 8u)  |
397                                          (static_cast<UTF32>(HexOctetToDecimal(cur+6)));
398                             }
399 
400                             const size_t dcount = count * 5; // this is enough to hold all possible outputs
401                             boost::scoped_array<UTF8> dest(new UTF8[dcount]);
402 
403                             const UTF32* srct = src.get();
404                             UTF8* destt = dest.get();
405                             if(ConvertUTF32toUTF8(&srct, srct+count, &destt, destt+dcount, lenientConversion) != conversionOK) {
406                                 return false;
407                             }
408 
409                             const size_t outcount = static_cast<size_t>(destt-dest.get());
410 
411                             s.erase(i,(j+4-i));
412 
413                             ai_assert(sizeof(UTF8) == 1);
414                             s.insert(i, reinterpret_cast<char*>(dest.get()), outcount);
415 
416                             i += outcount;
417                             continue;
418                         }
419                     }
420 
421                     break;
422 
423                     // TODO: other encoding patterns?
424 
425                 default:
426                     return false;
427                 }
428             }
429         }
430         ++i;
431     }
432     return true;
433 }
434