1 // =================================================================================================
2 // ADOBE SYSTEMS INCORPORATED
3 // Copyright 2006-2007 Adobe Systems Incorporated
4 // All Rights Reserved
5 //
6 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
7 // of the Adobe license agreement accompanying it.
8 // =================================================================================================
9
10 #include "XMP_Environment.h" // ! This must be the first include.
11
12 #include "Reconcile_Impl.hpp"
13
14 #include "UnicodeConversions.hpp"
15
16 #if XMP_WinBuild
17 #elif XMP_MacBuild
18 #include "UnicodeConverter.h"
19 #endif
20
21 // =================================================================================================
22 /// \file Reconcile_Impl.cpp
23 /// \brief Implementation utilities for the legacy metadata reconciliation support.
24 ///
25 // =================================================================================================
26
27 // =================================================================================================
28 // IsASCII
29 // =======
30 //
31 // See if a string is 7 bit ASCII.
32
IsASCII(const void * strPtr,size_t strLen)33 static inline bool IsASCII ( const void * strPtr, size_t strLen )
34 {
35
36 for ( const XMP_Uns8 * strPos = (XMP_Uns8*)strPtr; strLen > 0; --strLen, ++strPos ) {
37 if ( *strPos >= 0x80 ) return false;
38 }
39
40 return true;
41
42 } // IsASCII
43
44 // =================================================================================================
45 // ReconcileUtils::IsUTF8
46 // ======================
47 //
48 // See if a string contains valid UTF-8. Allow nul bytes, they can appear inside of multi-part Exif
49 // strings. We don't use CodePoint_from_UTF8_Multi in UnicodeConversions because it throws an
50 // exception for non-Unicode and we don't need to actually compute the code points.
51
IsUTF8(const void * utf8Ptr,size_t utf8Len)52 bool ReconcileUtils::IsUTF8 ( const void * utf8Ptr, size_t utf8Len )
53 {
54 const XMP_Uns8 * utf8Pos = (XMP_Uns8*)utf8Ptr;
55 const XMP_Uns8 * utf8End = utf8Pos + utf8Len;
56
57 while ( utf8Pos < utf8End ) {
58
59 if ( *utf8Pos < 0x80 ) {
60
61 ++utf8Pos; // ASCII is UTF-8, tolerate nuls.
62
63 } else {
64
65 // -------------------------------------------------------------------------------------
66 // We've got a multibyte UTF-8 character. The first byte has the number of bytes as the
67 // number of high order 1 bits. The remaining bytes must have 1 and 0 as the top 2 bits.
68
69 #if 0 // *** This might be a more effcient way to count the bytes.
70 static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
71 size_t bytesNeeded = kByteCounts [ *utf8Pos >> 4 ];
72 if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((*utf8Pos & 0x08) != 0)) ) return false;
73 if ( (utf8Pos + bytesNeeded) > utf8End ) return false;
74 #endif
75
76 size_t bytesNeeded = 0; // Count the high order 1 bits in the first byte.
77 for ( XMP_Uns8 temp = *utf8Pos; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
78 // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
79
80 if ( (bytesNeeded < 2) || (bytesNeeded > 4) || ((utf8Pos+bytesNeeded) > utf8End) ) return false;
81
82 for ( --bytesNeeded, ++utf8Pos; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
83 if ( (*utf8Pos >> 6) != 2 ) return false;
84 }
85
86 }
87
88 }
89
90 return true;
91
92 } // ReconcileUtils::IsUTF8
93
94 // =================================================================================================
95 // UTF8ToHostEncoding
96 // ==================
97
98 #if XMP_WinBuild
99
UTF8ToWinEncoding(UINT codePage,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)100 static void UTF8ToWinEncoding ( UINT codePage,
101 const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
102 {
103
104 std::string utf16; // WideCharToMultiByte wants native UTF-16.
105 ToUTF16Native ( (UTF8Unit*)utf8Ptr, utf8Len, &utf16 );
106
107 LPCWSTR utf16Ptr = (LPCWSTR) utf16.c_str();
108 size_t utf16Len = utf16.size() / 2;
109
110 int hostLen = WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, 0, 0, 0, 0 );
111 host->assign ( hostLen, ' ' ); // Allocate space for the results.
112
113 (void) WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, (LPSTR)host->data(), hostLen, 0, 0 );
114 XMP_Assert ( hostLen == host->size() );
115
116 } // UTF8ToWinEncoding
117
118 #elif XMP_MacBuild
119
UTF8ToMacEncoding(TextEncoding & destEncoding,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)120 static void UTF8ToMacEncoding ( TextEncoding & destEncoding,
121 const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
122 {
123 OSStatus err;
124
125 UnicodeMapping mappingInfo;
126 mappingInfo.mappingVersion = kUnicodeUseLatestMapping;
127 mappingInfo.otherEncoding = GetTextEncodingBase ( destEncoding );
128 mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
129 kUnicodeNoSubset, kUnicodeUTF8Format );
130
131 UnicodeToTextInfo converterInfo;
132 err = CreateUnicodeToTextInfo ( &mappingInfo, &converterInfo );
133 if ( err != noErr ) XMP_Throw ( "CreateUnicodeToTextInfo failed", kXMPErr_ExternalFailure );
134
135 try { // ! Need to call DisposeUnicodeToTextInfo before exiting.
136
137 OptionBits convFlags = kUnicodeUseFallbacksMask |
138 kUnicodeLooseMappingsMask | kUnicodeDefaultDirectionMask;
139 ByteCount bytesRead, bytesWritten;
140
141 enum { kBufferLen = 1000 }; // Ought to be enough in practice, without using too much stack.
142 char buffer [kBufferLen];
143
144 host->reserve ( utf8Len ); // As good a guess as any.
145
146 while ( utf8Len > 0 ) {
147 // Ignore all errors from ConvertFromUnicodeToText. It returns info like "output
148 // buffer full" or "use substitution" as errors.
149 err = ConvertFromUnicodeToText ( converterInfo, utf8Len, (UniChar*)utf8Ptr, convFlags,
150 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, buffer );
151 if ( bytesRead == 0 ) break; // Make sure forward progress happens.
152 host->append ( &buffer[0], bytesWritten );
153 utf8Ptr += bytesRead;
154 utf8Len -= bytesRead;
155 }
156
157 DisposeUnicodeToTextInfo ( &converterInfo );
158
159 } catch ( ... ) {
160
161 DisposeUnicodeToTextInfo ( &converterInfo );
162 throw;
163
164 }
165
166 } // UTF8ToMacEncoding
167
168 #elif XMP_UNIXBuild
169
170 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
171 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
172
173 #endif
174
175 // =================================================================================================
176 // ReconcileUtils::UTF8ToLocal
177 // ===========================
178
179 #if ! XMP_UNIXBuild
180 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
181 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
182
UTF8ToLocal(const void * _utf8Ptr,size_t utf8Len,std::string * local)183 void ReconcileUtils::UTF8ToLocal ( const void * _utf8Ptr, size_t utf8Len, std::string * local )
184 {
185 const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
186
187 local->erase();
188
189 if ( IsASCII ( utf8Ptr, utf8Len ) ) {
190 local->assign ( (const char *)utf8Ptr, utf8Len );
191 return;
192 }
193
194 #if XMP_WinBuild
195
196 UTF8ToWinEncoding ( CP_ACP, utf8Ptr, utf8Len, local );
197
198 #elif XMP_MacBuild
199
200 OSStatus err;
201
202 TextEncoding localEncoding;
203 err = UpgradeScriptInfoToTextEncoding ( smSystemScript,
204 kTextLanguageDontCare, kTextRegionDontCare, 0, &localEncoding );
205 if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
206
207 UTF8ToMacEncoding ( localEncoding, utf8Ptr, utf8Len, local );
208
209 #elif XMP_UNIXBuild
210
211 #error "No generic UNIX implementation"
212
213 #endif
214
215 } // ReconcileUtils::UTF8ToLocal
216
217 #endif
218
219 // =================================================================================================
220 // ReconcileUtils::UTF8ToLatin1
221 // ============================
222 //
223 // Actually to the Windows code page 1252 superset of 8859-1.
224
225 #if ! XMP_UNIXBuild
226 // ! Does not exist, must not be called, for Generic UNIX builds. At some point we could consider
227 // ! creating our own private implementation. So far only needed for the ID3 legacy in MP3 files.
228
UTF8ToLatin1(const void * _utf8Ptr,size_t utf8Len,std::string * latin1)229 void ReconcileUtils::UTF8ToLatin1 ( const void * _utf8Ptr, size_t utf8Len, std::string * latin1 )
230 {
231 const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
232
233 latin1->erase();
234
235 if ( IsASCII ( utf8Ptr, utf8Len ) ) {
236 latin1->assign ( (const char *)utf8Ptr, utf8Len );
237 return;
238 }
239
240 #if XMP_WinBuild
241
242 UTF8ToWinEncoding ( 1252, utf8Ptr, utf8Len, latin1 );
243
244 #elif XMP_MacBuild
245
246 TextEncoding latin1Encoding;
247 latin1Encoding = CreateTextEncoding ( kTextEncodingWindowsLatin1,
248 kTextEncodingDefaultVariant, kTextEncodingDefaultFormat );
249
250 UTF8ToMacEncoding ( latin1Encoding, utf8Ptr, utf8Len, latin1 );
251
252 #elif XMP_UNIXBuild
253
254 #error "No generic UNIX implementation"
255
256 #endif
257
258 } // ReconcileUtils::UTF8ToLatin1
259
260 #endif
261
262 // =================================================================================================
263 // HostEncodingToUTF8
264 // ==================
265
266 #if XMP_WinBuild
267
WinEncodingToUTF8(UINT codePage,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)268 static void WinEncodingToUTF8 ( UINT codePage,
269 const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
270 {
271
272 int utf16Len = MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, 0, 0 );
273 std::vector<UTF16Unit> utf16 ( utf16Len, 0 ); // MultiByteToWideChar returns native UTF-16.
274
275 (void) MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, (LPWSTR)&utf16[0], utf16Len );
276 FromUTF16Native ( &utf16[0], (int)utf16Len, utf8 );
277
278 } // WinEncodingToUTF8
279
280 #elif XMP_MacBuild
281
MacEncodingToUTF8(TextEncoding & srcEncoding,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)282 static void MacEncodingToUTF8 ( TextEncoding & srcEncoding,
283 const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
284 {
285 OSStatus err;
286
287 UnicodeMapping mappingInfo;
288 mappingInfo.mappingVersion = kUnicodeUseLatestMapping;
289 mappingInfo.otherEncoding = GetTextEncodingBase ( srcEncoding );
290 mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
291 kUnicodeNoSubset, kUnicodeUTF8Format );
292
293 TextToUnicodeInfo converterInfo;
294 err = CreateTextToUnicodeInfo ( &mappingInfo, &converterInfo );
295 if ( err != noErr ) XMP_Throw ( "CreateTextToUnicodeInfo failed", kXMPErr_ExternalFailure );
296
297 try { // ! Need to call DisposeTextToUnicodeInfo before exiting.
298
299 ByteCount bytesRead, bytesWritten;
300
301 enum { kBufferLen = 1000 }; // Ought to be enough in practice, without using too much stack.
302 char buffer [kBufferLen];
303
304 utf8->reserve ( hostLen ); // As good a guess as any.
305
306 while ( hostLen > 0 ) {
307 // Ignore all errors from ConvertFromTextToUnicode. It returns info like "output
308 // buffer full" or "use substitution" as errors.
309 err = ConvertFromTextToUnicode ( converterInfo, hostLen, hostPtr, kNilOptions,
310 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, (UniChar*)buffer );
311 if ( bytesRead == 0 ) break; // Make sure forward progress happens.
312 utf8->append ( &buffer[0], bytesWritten );
313 hostPtr += bytesRead;
314 hostLen -= bytesRead;
315 }
316
317 DisposeTextToUnicodeInfo ( &converterInfo );
318
319 } catch ( ... ) {
320
321 DisposeTextToUnicodeInfo ( &converterInfo );
322 throw;
323
324 }
325
326 } // MacEncodingToUTF8
327
328 #elif XMP_UNIXBuild
329
330 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
331 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
332
333 #endif
334
335 // =================================================================================================
336 // ReconcileUtils::LocalToUTF8
337 // ===========================
338
339 #if ! XMP_UNIXBuild
340 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
341 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
342
LocalToUTF8(const void * _localPtr,size_t localLen,std::string * utf8)343 void ReconcileUtils::LocalToUTF8 ( const void * _localPtr, size_t localLen, std::string * utf8 )
344 {
345 const XMP_Uns8* localPtr = (XMP_Uns8*)_localPtr;
346
347 utf8->erase();
348
349 if ( IsASCII ( localPtr, localLen ) ) {
350 utf8->assign ( (const char *)localPtr, localLen );
351 return;
352 }
353
354 #if XMP_WinBuild
355
356 WinEncodingToUTF8 ( CP_ACP, localPtr, localLen, utf8 );
357
358 #elif XMP_MacBuild
359
360 OSStatus err;
361
362 TextEncoding localEncoding;
363 err = UpgradeScriptInfoToTextEncoding ( smSystemScript, kTextLanguageDontCare, kTextRegionDontCare, 0, &localEncoding );
364 if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
365
366 MacEncodingToUTF8 ( localEncoding, localPtr, localLen, utf8 );
367
368 #elif XMP_UNIXBuild
369
370 #error "No generic UNIX implementation"
371
372 #endif
373
374 } // ReconcileUtils::LocalToUTF8
375
376 #endif
377
378 // =================================================================================================
379 // ReconcileUtils::Latin1ToUTF8
380 // ============================
381 //
382 // Actually from the Windows code page 1252 superset of 8859-1.
383
384 #if ! XMP_UNIXBuild
385 // ! Does not exist, must not be called, for Generic UNIX builds. At some point we could consider
386 // ! creating our own private implementation. So far only needed for the ID3 legacy in MP3 files.
387
Latin1ToUTF8(const void * _latin1Ptr,size_t latin1Len,std::string * utf8)388 void ReconcileUtils::Latin1ToUTF8 ( const void * _latin1Ptr, size_t latin1Len, std::string * utf8 )
389 {
390 const XMP_Uns8* latin1Ptr = (XMP_Uns8*)_latin1Ptr;
391
392 utf8->erase();
393
394 if ( IsASCII ( latin1Ptr, latin1Len ) ) {
395 utf8->assign ( (const char *)latin1Ptr, latin1Len );
396 return;
397 }
398
399 #if XMP_WinBuild
400
401 WinEncodingToUTF8 ( 1252, latin1Ptr, latin1Len, utf8 );
402
403 #elif XMP_MacBuild
404
405 TextEncoding latin1Encoding;
406 latin1Encoding = CreateTextEncoding ( kTextEncodingWindowsLatin1,
407 kTextEncodingDefaultVariant, kTextEncodingDefaultFormat );
408
409 MacEncodingToUTF8 ( latin1Encoding, latin1Ptr, latin1Len, utf8 );
410
411 #elif XMP_UNIXBuild
412
413 #error "No generic UNIX implementation"
414
415 #endif
416
417 } // ReconcileUtils::Latin1ToUTF8
418
419 #endif
420