1 // =================================================================================================
2 // ADOBE SYSTEMS INCORPORATED
3 // Copyright 2006 Adobe Systems Incorporated
4 // All Rights Reserved
5 //
6 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
7 // of the Adobe license agreement accompanying it.
8 // =================================================================================================
9
10 #include "public/include/XMP_Environment.h" // ! This must be the first include.
11 #include "public/include/XMP_Const.h"
12
13 #include "XMPFiles/source/FormatSupport/Reconcile_Impl.hpp"
14 #include "source/UnicodeConversions.hpp"
15 #include "source/XIO.hpp"
16
17 #if XMP_WinBuild
18 #elif XMP_MacBuild
19 #include <CoreServices/CoreServices.h>
20 #elif XMP_iOSBuild
21 #include <CoreFoundation/CoreFoundation.h>
22 #endif
23
// The 16 uppercase hexadecimal digit characters, in value order.
const char * ReconcileUtils::kHexDigits = "0123456789ABCDEF";
25
26 // =================================================================================================
27 /// \file Reconcile_Impl.cpp
28 /// \brief Implementation utilities for the photo metadata reconciliation support.
29 ///
30 // =================================================================================================
31
32 // =================================================================================================
33 // ReconcileUtils::IsASCII
34 // =======================
35 //
36 // See if a string is 7 bit ASCII.
37
IsASCII(const void * textPtr,size_t textLen)38 bool ReconcileUtils::IsASCII ( const void * textPtr, size_t textLen )
39 {
40
41 for ( const XMP_Uns8 * textPos = (XMP_Uns8*)textPtr; textLen > 0; --textLen, ++textPos ) {
42 if ( *textPos >= 0x80 ) return false;
43 }
44
45 return true;
46
47 } // ReconcileUtils::IsASCII
48
49 // =================================================================================================
50 // ReconcileUtils::IsUTF8
51 // ======================
52 //
53 // See if a string contains valid UTF-8. Allow nul bytes, they can appear inside of multi-part Exif
54 // strings. We don't use CodePoint_from_UTF8_Multi in UnicodeConversions because it throws an
55 // exception for non-Unicode and we don't need to actually compute the code points.
56
IsUTF8(const void * textPtr,size_t textLen)57 bool ReconcileUtils::IsUTF8 ( const void * textPtr, size_t textLen )
58 {
59 const XMP_Uns8 * textPos = (XMP_Uns8*)textPtr;
60 const XMP_Uns8 * textEnd = textPos + textLen;
61
62 while ( textPos < textEnd ) {
63
64 if ( *textPos < 0x80 ) {
65
66 ++textPos; // ASCII is UTF-8, tolerate nuls.
67
68 } else {
69
70 // -------------------------------------------------------------------------------------
71 // We've got a multibyte UTF-8 character. The first byte has the number of bytes as the
72 // number of high order 1 bits. The remaining bytes must have 1 and 0 as the top 2 bits.
73
74 #if 0 // *** This might be a more effcient way to count the bytes.
75 static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
76 size_t bytesNeeded = kByteCounts [ *textPos >> 4 ];
77 if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((*textPos & 0x08) != 0)) ) return false;
78 if ( (textPos + bytesNeeded) > textEnd ) return false;
79 #endif
80
81 size_t bytesNeeded = 0; // Count the high order 1 bits in the first byte.
82 for ( XMP_Uns8 temp = *textPos; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
83 // *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
84
85 if ( (bytesNeeded < 2) || (bytesNeeded > 4) || ((textPos+bytesNeeded) > textEnd) ) return false;
86
87 for ( --bytesNeeded, ++textPos; bytesNeeded > 0; --bytesNeeded, ++textPos ) {
88 if ( (*textPos >> 6) != 2 ) return false;
89 }
90
91 }
92
93 }
94
95 return true; // ! Returns true for empty strings.
96
97 } // ReconcileUtils::IsUTF8
98
99 // =================================================================================================
100 // UTF8ToHostEncoding
101 // ==================
102
103 #if XMP_WinBuild
104
UTF8ToWinEncoding(UINT codePage,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)105 void ReconcileUtils::UTF8ToWinEncoding ( UINT codePage, const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
106 {
107
108 std::string utf16; // WideCharToMultiByte wants native UTF-16.
109 ToUTF16Native ( (UTF8Unit*)utf8Ptr, utf8Len, &utf16 );
110
111 LPCWSTR utf16Ptr = (LPCWSTR) utf16.c_str();
112 size_t utf16Len = utf16.size() / 2;
113
114 int hostLen = WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, 0, 0, 0, 0 );
115 host->assign ( hostLen, ' ' ); // Allocate space for the results.
116
117 (void) WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, (LPSTR)host->data(), hostLen, 0, 0 );
118 XMP_Assert ( hostLen == host->size() );
119
120 } // UTF8ToWinEncoding
121
122 #elif XMP_MacBuild
123
UTF8ToMacEncoding(XMP_Uns16 macScript,XMP_Uns16 macLang,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)124 void ReconcileUtils::UTF8ToMacEncoding ( XMP_Uns16 macScript, XMP_Uns16 macLang, const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
125 {
126 OSStatus err;
127
128 TextEncoding destEncoding;
129 if ( macLang == langUnspecified ) macLang = kTextLanguageDontCare;
130 err = UpgradeScriptInfoToTextEncoding ( macScript, macLang, kTextRegionDontCare, 0, &destEncoding );
131 if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
132
133 UnicodeMapping mappingInfo;
134 mappingInfo.mappingVersion = kUnicodeUseLatestMapping;
135 mappingInfo.otherEncoding = GetTextEncodingBase ( destEncoding );
136 mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
137 kUnicodeNoSubset, kUnicodeUTF8Format );
138
139 UnicodeToTextInfo converterInfo;
140 err = CreateUnicodeToTextInfo ( &mappingInfo, &converterInfo );
141 if ( err != noErr ) XMP_Throw ( "CreateUnicodeToTextInfo failed", kXMPErr_ExternalFailure );
142
143 try { // ! Need to call DisposeUnicodeToTextInfo before exiting.
144
145 OptionBits convFlags = kUnicodeUseFallbacksMask |
146 kUnicodeLooseMappingsMask | kUnicodeDefaultDirectionMask;
147 ByteCount bytesRead, bytesWritten;
148
149 enum { kBufferLen = 1000 }; // Ought to be enough in practice, without using too much stack.
150 char buffer [kBufferLen];
151
152 host->reserve ( utf8Len ); // As good a guess as any.
153
154 while ( utf8Len > 0 ) {
155 // Ignore all errors from ConvertFromUnicodeToText. It returns info like "output
156 // buffer full" or "use substitution" as errors.
157 err = ConvertFromUnicodeToText ( converterInfo, utf8Len, (UniChar*)utf8Ptr, convFlags,
158 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, buffer );
159 if ( bytesRead == 0 ) break; // Make sure forward progress happens.
160 host->append ( &buffer[0], bytesWritten );
161 utf8Ptr += bytesRead;
162 utf8Len -= bytesRead;
163 }
164
165 DisposeUnicodeToTextInfo ( &converterInfo );
166
167 } catch ( ... ) {
168
169 DisposeUnicodeToTextInfo ( &converterInfo );
170 throw;
171
172 }
173
174 } // UTF8ToMacEncoding
175
176 #elif XMP_UNIXBuild
177
178 // ! Does not exist, must not be called, for Generic UNIX builds.
179
180 #endif
181
182 // =================================================================================================
183 // ReconcileUtils::UTF8ToLocal
184 // ===========================
185
UTF8ToLocal(const void * _utf8Ptr,size_t utf8Len,std::string * local)186 void ReconcileUtils::UTF8ToLocal ( const void * _utf8Ptr, size_t utf8Len, std::string * local )
187 {
188 const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
189
190 local->erase();
191
192 if ( ReconcileUtils::IsASCII ( utf8Ptr, utf8Len ) ) {
193 local->assign ( (const char *)utf8Ptr, utf8Len );
194 return;
195 }
196
197 #if XMP_WinBuild
198
199 UTF8ToWinEncoding ( CP_ACP, utf8Ptr, utf8Len, local );
200
201 #elif XMP_MacBuild
202
203 UTF8ToMacEncoding ( smSystemScript, kTextLanguageDontCare, utf8Ptr, utf8Len, local );
204
205 #elif XMP_UNIXBuild
206
207 XMP_Throw ( "Generic UNIX does not have conversions between local and Unicode", kXMPErr_Unavailable );
208
209 #elif XMP_iOSBuild
210
211 IOSConvertEncoding(kCFStringEncodingUTF8, CFStringGetSystemEncoding(), utf8Ptr, utf8Len, local);
212
213
214
215
216 #endif
217
218 } // ReconcileUtils::UTF8ToLocal
219
220 // =================================================================================================
221 // ReconcileUtils::UTF8ToLatin1
222 // ============================
223
UTF8ToLatin1(const void * _utf8Ptr,size_t utf8Len,std::string * latin1)224 void ReconcileUtils::UTF8ToLatin1 ( const void * _utf8Ptr, size_t utf8Len, std::string * latin1 )
225 {
226 const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
227 const XMP_Uns8* utf8End = utf8Ptr + utf8Len;
228
229 latin1->erase();
230 latin1->reserve ( utf8Len ); // As good a guess as any, at least enough, exact for ASCII.
231
232 bool inBadRun = false;
233
234 while ( utf8Ptr < utf8End ) {
235
236 if ( *utf8Ptr <= 0x7F ) {
237
238 (*latin1) += (char)*utf8Ptr; // Have an ASCII character.
239 inBadRun = false;
240 ++utf8Ptr;
241
242 } else if ( utf8Ptr == (utf8End - 1) ) {
243
244 inBadRun = false;
245 ++utf8Ptr; // Ignore a bad end to the UTF-8.
246
247 } else {
248
249 XMP_Assert ( (utf8End - utf8Ptr) >= 2 );
250 XMP_Uns16 ch16 = GetUns16BE ( utf8Ptr ); // A Latin-1 80..FF is 2 UTF-8 bytes.
251
252 if ( (0xC280 <= ch16) && (ch16 <= 0xC2BF) ) {
253
254 (*latin1) += (char)(ch16 & 0xFF); // UTF-8 C280..C2BF are Latin-1 80..BF.
255 inBadRun = false;
256 utf8Ptr += 2;
257
258 } else if ( (0xC380 <= ch16) && (ch16 <= 0xC3BF) ) {
259
260 (*latin1) += (char)((ch16 & 0xFF) + 0x40); // UTF-8 C380..C3BF are Latin-1 C0..FF.
261 inBadRun = false;
262 utf8Ptr += 2;
263
264 } else {
265
266 if ( ! inBadRun ) {
267 inBadRun = true;
268 (*latin1) += "(?)"; // Mark the run of out of scope UTF-8.
269 }
270
271 ++utf8Ptr; // Skip the presumably well-formed UTF-8 character.
272 while ( (utf8Ptr < utf8End) && ((*utf8Ptr & 0xC0) == 0x80) ) ++utf8Ptr;
273
274 }
275
276 }
277
278 }
279
280 XMP_Assert ( utf8Ptr == utf8End );
281
282 } // ReconcileUtils::UTF8ToLatin1
283
284 // =================================================================================================
285 // HostEncodingToUTF8
286 // ==================
287
288 #if XMP_WinBuild
289
WinEncodingToUTF8(UINT codePage,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)290 void ReconcileUtils::WinEncodingToUTF8 ( UINT codePage, const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
291 {
292
293 int utf16Len = MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, 0, 0 );
294 std::vector<UTF16Unit> utf16 ( utf16Len, 0 ); // MultiByteToWideChar returns native UTF-16.
295
296 (void) MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, (LPWSTR)&utf16[0], utf16Len );
297 FromUTF16Native ( &utf16[0], (int)utf16Len, utf8 );
298
299 } // WinEncodingToUTF8
300
301 #elif XMP_MacBuild
302
MacEncodingToUTF8(XMP_Uns16 macScript,XMP_Uns16 macLang,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)303 void ReconcileUtils::MacEncodingToUTF8 ( XMP_Uns16 macScript, XMP_Uns16 macLang, const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
304 {
305 OSStatus err;
306
307 TextEncoding srcEncoding;
308 if ( macLang == langUnspecified ) macLang = kTextLanguageDontCare;
309 err = UpgradeScriptInfoToTextEncoding ( macScript, macLang, kTextRegionDontCare, 0, &srcEncoding );
310 if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
311
312 UnicodeMapping mappingInfo;
313 mappingInfo.mappingVersion = kUnicodeUseLatestMapping;
314 mappingInfo.otherEncoding = GetTextEncodingBase ( srcEncoding );
315 mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
316 kUnicodeNoSubset, kUnicodeUTF8Format );
317
318 TextToUnicodeInfo converterInfo;
319 err = CreateTextToUnicodeInfo ( &mappingInfo, &converterInfo );
320 if ( err != noErr ) XMP_Throw ( "CreateTextToUnicodeInfo failed", kXMPErr_ExternalFailure );
321
322 try { // ! Need to call DisposeTextToUnicodeInfo before exiting.
323
324 ByteCount bytesRead, bytesWritten;
325
326 enum { kBufferLen = 1000 }; // Ought to be enough in practice, without using too much stack.
327 char buffer [kBufferLen];
328
329 utf8->reserve ( hostLen ); // As good a guess as any.
330
331 while ( hostLen > 0 ) {
332 // Ignore all errors from ConvertFromTextToUnicode. It returns info like "output
333 // buffer full" or "use substitution" as errors.
334 err = ConvertFromTextToUnicode ( converterInfo, hostLen, hostPtr, kNilOptions,
335 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, (UniChar*)buffer );
336 if ( bytesRead == 0 ) break; // Make sure forward progress happens.
337 utf8->append ( &buffer[0], bytesWritten );
338 hostPtr += bytesRead;
339 hostLen -= bytesRead;
340 }
341
342 DisposeTextToUnicodeInfo ( &converterInfo );
343
344 } catch ( ... ) {
345
346 DisposeTextToUnicodeInfo ( &converterInfo );
347 throw;
348
349 }
350
351 } // MacEncodingToUTF8
352
353 #elif XMP_UNIXBuild
354
355 // ! Does not exist, must not be called, for Generic UNIX builds.
356
357 #endif
358
359 // =================================================================================================
360 // ReconcileUtils::LocalToUTF8
361 // ===========================
362
LocalToUTF8(const void * _localPtr,size_t localLen,std::string * utf8)363 void ReconcileUtils::LocalToUTF8 ( const void * _localPtr, size_t localLen, std::string * utf8 )
364 {
365 const XMP_Uns8* localPtr = (XMP_Uns8*)_localPtr;
366
367 utf8->erase();
368
369 if ( ReconcileUtils::IsASCII ( localPtr, localLen ) ) {
370 utf8->assign ( (const char *)localPtr, localLen );
371 return;
372 }
373
374 #if XMP_WinBuild
375
376 WinEncodingToUTF8 ( CP_ACP, localPtr, localLen, utf8 );
377
378 #elif XMP_MacBuild
379
380 MacEncodingToUTF8 ( smSystemScript, kTextLanguageDontCare, localPtr, localLen, utf8 );
381
382 #elif XMP_UNIXBuild
383
384 XMP_Throw ( "Generic UNIX does not have conversions between local and Unicode", kXMPErr_Unavailable );
385
386 #elif XMP_iOSBuild
387
388 IOSConvertEncoding(CFStringGetSystemEncoding(), kCFStringEncodingUTF8, localPtr, localLen, utf8);
389
390
391 #endif
392
393 } // ReconcileUtils::LocalToUTF8
394
395 // =================================================================================================
396 // ReconcileUtils::Latin1ToUTF8
397 // ============================
398
Latin1ToUTF8(const void * _latin1Ptr,size_t latin1Len,std::string * utf8)399 void ReconcileUtils::Latin1ToUTF8 ( const void * _latin1Ptr, size_t latin1Len, std::string * utf8 )
400 {
401 const XMP_Uns8* latin1Ptr = (XMP_Uns8*)_latin1Ptr;
402 const XMP_Uns8* latin1End = latin1Ptr + latin1Len;
403
404 utf8->erase();
405 utf8->reserve ( latin1Len ); // As good a guess as any, exact for ASCII.
406
407 for ( ; latin1Ptr < latin1End; ++latin1Ptr ) {
408
409 XMP_Uns8 ch8 = *latin1Ptr;
410
411 if ( ch8 <= 0x7F ) {
412 (*utf8) += (char)ch8; // Have an ASCII character.
413 } else if ( ch8 <= 0xBF ) {
414 (*utf8) += 0xC2; // Latin-1 80..BF are UTF-8 C280..C2BF.
415 (*utf8) += (char)ch8;
416 } else {
417 (*utf8) += 0xC3; // Latin-1 C0..FF are UTF-8 C380..C3BF.
418 (*utf8) += (char)(ch8 - 0x40);
419 }
420
421 }
422
423 } // ReconcileUtils::Latin1ToUTF8
424
425
426 // =================================================================================================
427 // ReconcileUtils::NativeToUTF8
428 // ============================
429
NativeToUTF8(const std::string & input,std::string & output)430 void ReconcileUtils::NativeToUTF8( const std::string & input, std::string & output )
431 {
432 output.erase();
433 // IF it is not UTF-8
434 if( ! ReconcileUtils::IsUTF8( input.c_str(), input.length() ) )
435 {
436 // And ServerMode is not active
437 if( ! ignoreLocalText )
438 {
439 // Convert it to UTF-8
440 ReconcileUtils::LocalToUTF8( input.c_str(), input.length(), &output );
441 }
442 }
443 else // If it is already UTF-8
444 {
445 output = input;
446 }
447 } // ReconcileUtils::NativeToUTF8
448
449
450 #if XMP_iOSBuild
IOSConvertEncoding(XMP_Uns32 srcEncoding,XMP_Uns32 destEncoding,const XMP_Uns8 * inputPtr,size_t inputLen,std::string * output)451 void ReconcileUtils::IOSConvertEncoding(XMP_Uns32 srcEncoding, XMP_Uns32 destEncoding, const XMP_Uns8 * inputPtr, size_t inputLen, std::string * output)
452 {
453 if(srcEncoding == kCFStringEncodingInvalidId || destEncoding == kCFStringEncodingInvalidId ||
454 !CFStringIsEncodingAvailable(srcEncoding) || !CFStringIsEncodingAvailable(destEncoding))
455 return;
456 CFStringRef cStrRef = CFStringCreateWithBytesNoCopy(NULL, inputPtr, inputLen, srcEncoding, false, kCFAllocatorNull);
457 if(cStrRef == NULL)
458 return;
459 CFRange inputRange = CFRangeMake(0, CFStringGetLength(cStrRef));
460 const size_t kBufferLen = 1000;
461 while(inputRange.length > 0)
462 {
463 XMP_Uns8 buffer[kBufferLen];
464 CFIndex charsWritten;
465 CFIndex charsProcessed = CFStringGetBytes(cStrRef, inputRange, destEncoding, 0, FALSE, buffer, kBufferLen, &charsWritten);
466 if (charsProcessed == 0) break;
467 output->append(reinterpret_cast<const char*>(&buffer[0]), charsWritten);
468 inputRange.location += charsProcessed;
469 inputRange.length -= charsProcessed;
470 }
471 CFRelease(cStrRef);
472 }
473 #endif
474