1 // =================================================================================================
2 // ADOBE SYSTEMS INCORPORATED
3 // Copyright 2006 Adobe Systems Incorporated
4 // All Rights Reserved
5 //
6 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
7 // of the Adobe license agreement accompanying it.
8 // =================================================================================================
9 
10 #include "public/include/XMP_Environment.h"	// ! This must be the first include.
11 #include "public/include/XMP_Const.h"
12 
13 #include "XMPFiles/source/FormatSupport/Reconcile_Impl.hpp"
14 #include "source/UnicodeConversions.hpp"
15 #include "source/XIO.hpp"
16 
17 #if XMP_WinBuild
18 #elif XMP_MacBuild
19 	#include <CoreServices/CoreServices.h>
20 #elif XMP_iOSBuild
21     #include <CoreFoundation/CoreFoundation.h>
22 #endif
23 
// The sixteen uppercase hexadecimal digit characters, in value order ('0' through 'F').
// NOTE(review): presumably indexed by a 4-bit nibble value when formatting hex text; no use is
// visible in this file - confirm against callers.
const char * ReconcileUtils::kHexDigits = "0123456789ABCDEF";
25 
26 // =================================================================================================
27 /// \file Reconcile_Impl.cpp
28 /// \brief Implementation utilities for the photo metadata reconciliation support.
29 ///
30 // =================================================================================================
31 
32 // =================================================================================================
33 // ReconcileUtils::IsASCII
34 // =======================
35 //
36 // See if a string is 7 bit ASCII.
37 
IsASCII(const void * textPtr,size_t textLen)38 bool ReconcileUtils::IsASCII ( const void * textPtr, size_t textLen )
39 {
40 
41 	for ( const XMP_Uns8 * textPos = (XMP_Uns8*)textPtr; textLen > 0; --textLen, ++textPos ) {
42 		if ( *textPos >= 0x80 ) return false;
43 	}
44 
45 	return true;
46 
47 }	// ReconcileUtils::IsASCII
48 
49 // =================================================================================================
50 // ReconcileUtils::IsUTF8
51 // ======================
52 //
53 // See if a string contains valid UTF-8. Allow nul bytes, they can appear inside of multi-part Exif
54 // strings. We don't use CodePoint_from_UTF8_Multi in UnicodeConversions because it throws an
55 // exception for non-Unicode and we don't need to actually compute the code points.
56 
IsUTF8(const void * textPtr,size_t textLen)57 bool ReconcileUtils::IsUTF8 ( const void * textPtr, size_t textLen )
58 {
59 	const XMP_Uns8 * textPos = (XMP_Uns8*)textPtr;
60 	const XMP_Uns8 * textEnd = textPos + textLen;
61 
62 	while ( textPos < textEnd ) {
63 
64 		if ( *textPos < 0x80 ) {
65 
66 			++textPos;	// ASCII is UTF-8, tolerate nuls.
67 
68 		} else {
69 
70 			// -------------------------------------------------------------------------------------
71 			// We've got a multibyte UTF-8 character. The first byte has the number of bytes as the
72 			// number of high order 1 bits. The remaining bytes must have 1 and 0 as the top 2 bits.
73 
74 			#if 0	// *** This might be a more effcient way to count the bytes.
75 				static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
76 				size_t bytesNeeded = kByteCounts [ *textPos >> 4 ];
77 				if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((*textPos & 0x08) != 0)) ) return false;
78 				if ( (textPos + bytesNeeded) > textEnd ) return false;
79 			#endif
80 
81 			size_t bytesNeeded = 0;	// Count the high order 1 bits in the first byte.
82 			for ( XMP_Uns8 temp = *textPos; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
83 				// *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
84 
85 			if ( (bytesNeeded < 2) || (bytesNeeded > 4) || ((textPos+bytesNeeded) > textEnd) ) return false;
86 
87 			for ( --bytesNeeded, ++textPos; bytesNeeded > 0; --bytesNeeded, ++textPos ) {
88 				if ( (*textPos >> 6) != 2 ) return false;
89 			}
90 
91 		}
92 
93 	}
94 
95 	return true;	// ! Returns true for empty strings.
96 
97 }	// ReconcileUtils::IsUTF8
98 
99 // =================================================================================================
100 // UTF8ToHostEncoding
101 // ==================
102 
103 #if XMP_WinBuild
104 
UTF8ToWinEncoding(UINT codePage,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)105 	void ReconcileUtils::UTF8ToWinEncoding ( UINT codePage, const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
106 	{
107 
108 		std::string utf16;	// WideCharToMultiByte wants native UTF-16.
109 		ToUTF16Native ( (UTF8Unit*)utf8Ptr, utf8Len, &utf16 );
110 
111 		LPCWSTR utf16Ptr = (LPCWSTR) utf16.c_str();
112 		size_t  utf16Len = utf16.size() / 2;
113 
114 		int hostLen = WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, 0, 0, 0, 0 );
115 		host->assign ( hostLen, ' ' );	// Allocate space for the results.
116 
117 		(void) WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, (LPSTR)host->data(), hostLen, 0, 0 );
118 		XMP_Assert ( hostLen == host->size() );
119 
120 	}	// UTF8ToWinEncoding
121 
122 #elif XMP_MacBuild
123 
UTF8ToMacEncoding(XMP_Uns16 macScript,XMP_Uns16 macLang,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)124 	void ReconcileUtils::UTF8ToMacEncoding ( XMP_Uns16 macScript, XMP_Uns16 macLang, const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
125 	{
126 		OSStatus err;
127 
128 		TextEncoding destEncoding;
129 		if ( macLang == langUnspecified ) macLang = kTextLanguageDontCare;
130 		err = UpgradeScriptInfoToTextEncoding ( macScript, macLang, kTextRegionDontCare, 0, &destEncoding );
131 		if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
132 
133 		UnicodeMapping mappingInfo;
134 		mappingInfo.mappingVersion  = kUnicodeUseLatestMapping;
135 		mappingInfo.otherEncoding   = GetTextEncodingBase ( destEncoding );
136 		mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
137 														   kUnicodeNoSubset, kUnicodeUTF8Format );
138 
139 		UnicodeToTextInfo converterInfo;
140 		err = CreateUnicodeToTextInfo ( &mappingInfo, &converterInfo );
141 		if ( err != noErr ) XMP_Throw ( "CreateUnicodeToTextInfo failed", kXMPErr_ExternalFailure );
142 
143 		try {	// ! Need to call DisposeUnicodeToTextInfo before exiting.
144 
145 			OptionBits convFlags = kUnicodeUseFallbacksMask |
146 								   kUnicodeLooseMappingsMask |  kUnicodeDefaultDirectionMask;
147 			ByteCount bytesRead, bytesWritten;
148 
149 			enum { kBufferLen = 1000 };	// Ought to be enough in practice, without using too much stack.
150 			char buffer [kBufferLen];
151 
152 			host->reserve ( utf8Len );	// As good a guess as any.
153 
154 			while ( utf8Len > 0 ) {
155 				// Ignore all errors from ConvertFromUnicodeToText. It returns info like "output
156 				// buffer full" or "use substitution" as errors.
157 				err = ConvertFromUnicodeToText ( converterInfo, utf8Len, (UniChar*)utf8Ptr, convFlags,
158 												 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, buffer );
159 				if ( bytesRead == 0 ) break;	// Make sure forward progress happens.
160 				host->append ( &buffer[0], bytesWritten );
161 				utf8Ptr += bytesRead;
162 				utf8Len -= bytesRead;
163 			}
164 
165 			DisposeUnicodeToTextInfo ( &converterInfo );
166 
167 		} catch ( ... ) {
168 
169 			DisposeUnicodeToTextInfo ( &converterInfo );
170 			throw;
171 
172 		}
173 
174 	}	// UTF8ToMacEncoding
175 
176 #elif XMP_UNIXBuild
177 
178 	// ! Does not exist, must not be called, for Generic UNIX builds.
179 
180 #endif
181 
182 // =================================================================================================
183 // ReconcileUtils::UTF8ToLocal
184 // ===========================
185 
UTF8ToLocal(const void * _utf8Ptr,size_t utf8Len,std::string * local)186 void ReconcileUtils::UTF8ToLocal ( const void * _utf8Ptr, size_t utf8Len, std::string * local )
187 {
188 	const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
189 
190 	local->erase();
191 
192 	if ( ReconcileUtils::IsASCII ( utf8Ptr, utf8Len ) ) {
193 		local->assign ( (const char *)utf8Ptr, utf8Len );
194 		return;
195 	}
196 
197 	#if XMP_WinBuild
198 
199 		UTF8ToWinEncoding ( CP_ACP, utf8Ptr, utf8Len, local );
200 
201 	#elif XMP_MacBuild
202 
203 		UTF8ToMacEncoding ( smSystemScript, kTextLanguageDontCare, utf8Ptr, utf8Len, local );
204 
205 	#elif XMP_UNIXBuild
206 
207 		XMP_Throw ( "Generic UNIX does not have conversions between local and Unicode", kXMPErr_Unavailable );
208 
209     #elif XMP_iOSBuild
210 
211         IOSConvertEncoding(kCFStringEncodingUTF8, CFStringGetSystemEncoding(), utf8Ptr, utf8Len, local);
212 
213 
214 
215 
216 	#endif
217 
218 }	// ReconcileUtils::UTF8ToLocal
219 
220 // =================================================================================================
221 // ReconcileUtils::UTF8ToLatin1
222 // ============================
223 
UTF8ToLatin1(const void * _utf8Ptr,size_t utf8Len,std::string * latin1)224 void ReconcileUtils::UTF8ToLatin1 ( const void * _utf8Ptr, size_t utf8Len, std::string * latin1 )
225 {
226 	const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
227 	const XMP_Uns8* utf8End = utf8Ptr + utf8Len;
228 
229 	latin1->erase();
230 	latin1->reserve ( utf8Len );	// As good a guess as any, at least enough, exact for ASCII.
231 
232 	bool inBadRun = false;
233 
234 	while ( utf8Ptr < utf8End ) {
235 
236 		if ( *utf8Ptr <= 0x7F ) {
237 
238 			(*latin1) += (char)*utf8Ptr;	// Have an ASCII character.
239 			inBadRun = false;
240 			++utf8Ptr;
241 
242 		} else if ( utf8Ptr == (utf8End - 1) ) {
243 
244 			inBadRun = false;
245 			++utf8Ptr;	// Ignore a bad end to the UTF-8.
246 
247 		} else {
248 
249 			XMP_Assert ( (utf8End - utf8Ptr) >= 2 );
250 			XMP_Uns16 ch16 = GetUns16BE ( utf8Ptr );	// A Latin-1 80..FF is 2 UTF-8 bytes.
251 
252 			if ( (0xC280 <= ch16) && (ch16 <= 0xC2BF) ) {
253 
254 				(*latin1) += (char)(ch16 & 0xFF);	// UTF-8 C280..C2BF are Latin-1 80..BF.
255 				inBadRun = false;
256 				utf8Ptr += 2;
257 
258 			} else if ( (0xC380 <= ch16) && (ch16 <= 0xC3BF) ) {
259 
260 				(*latin1) += (char)((ch16 & 0xFF) + 0x40);	// UTF-8 C380..C3BF are Latin-1 C0..FF.
261 				inBadRun = false;
262 				utf8Ptr += 2;
263 
264 			} else {
265 
266 				if ( ! inBadRun ) {
267 					inBadRun = true;
268 					(*latin1) += "(?)";	// Mark the run of out of scope UTF-8.
269 				}
270 
271 				++utf8Ptr;	// Skip the presumably well-formed UTF-8 character.
272 				while ( (utf8Ptr < utf8End) && ((*utf8Ptr & 0xC0) == 0x80) ) ++utf8Ptr;
273 
274 			}
275 
276 		}
277 
278 	}
279 
280 	XMP_Assert ( utf8Ptr == utf8End );
281 
282 }	// ReconcileUtils::UTF8ToLatin1
283 
284 // =================================================================================================
285 // HostEncodingToUTF8
286 // ==================
287 
288 #if XMP_WinBuild
289 
WinEncodingToUTF8(UINT codePage,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)290 	void ReconcileUtils::WinEncodingToUTF8 ( UINT codePage, const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
291 	{
292 
293 		int utf16Len = MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, 0, 0 );
294 		std::vector<UTF16Unit> utf16 ( utf16Len, 0 );	// MultiByteToWideChar returns native UTF-16.
295 
296 		(void) MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, (LPWSTR)&utf16[0], utf16Len );
297 		FromUTF16Native ( &utf16[0], (int)utf16Len, utf8 );
298 
299 	}	// WinEncodingToUTF8
300 
301 #elif XMP_MacBuild
302 
MacEncodingToUTF8(XMP_Uns16 macScript,XMP_Uns16 macLang,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)303 	void ReconcileUtils::MacEncodingToUTF8 ( XMP_Uns16 macScript, XMP_Uns16 macLang, const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
304 	{
305 		OSStatus err;
306 
307 		TextEncoding srcEncoding;
308 		if ( macLang == langUnspecified ) macLang = kTextLanguageDontCare;
309 		err = UpgradeScriptInfoToTextEncoding ( macScript, macLang, kTextRegionDontCare, 0, &srcEncoding );
310 		if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
311 
312 		UnicodeMapping mappingInfo;
313 		mappingInfo.mappingVersion  = kUnicodeUseLatestMapping;
314 		mappingInfo.otherEncoding   = GetTextEncodingBase ( srcEncoding );
315 		mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
316 														   kUnicodeNoSubset, kUnicodeUTF8Format );
317 
318 		TextToUnicodeInfo converterInfo;
319 		err = CreateTextToUnicodeInfo ( &mappingInfo, &converterInfo );
320 		if ( err != noErr ) XMP_Throw ( "CreateTextToUnicodeInfo failed", kXMPErr_ExternalFailure );
321 
322 		try {	// ! Need to call DisposeTextToUnicodeInfo before exiting.
323 
324 			ByteCount bytesRead, bytesWritten;
325 
326 			enum { kBufferLen = 1000 };	// Ought to be enough in practice, without using too much stack.
327 			char buffer [kBufferLen];
328 
329 			utf8->reserve ( hostLen );	// As good a guess as any.
330 
331 			while ( hostLen > 0 ) {
332 				// Ignore all errors from ConvertFromTextToUnicode. It returns info like "output
333 				// buffer full" or "use substitution" as errors.
334 				err = ConvertFromTextToUnicode ( converterInfo, hostLen, hostPtr, kNilOptions,
335 												 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, (UniChar*)buffer );
336 				if ( bytesRead == 0 ) break;	// Make sure forward progress happens.
337 				utf8->append ( &buffer[0], bytesWritten );
338 				hostPtr += bytesRead;
339 				hostLen -= bytesRead;
340 			}
341 
342 			DisposeTextToUnicodeInfo ( &converterInfo );
343 
344 		} catch ( ... ) {
345 
346 			DisposeTextToUnicodeInfo ( &converterInfo );
347 			throw;
348 
349 		}
350 
351 	}	// MacEncodingToUTF8
352 
353 #elif XMP_UNIXBuild
354 
355 	// ! Does not exist, must not be called, for Generic UNIX builds.
356 
357 #endif
358 
359 // =================================================================================================
360 // ReconcileUtils::LocalToUTF8
361 // ===========================
362 
LocalToUTF8(const void * _localPtr,size_t localLen,std::string * utf8)363 void ReconcileUtils::LocalToUTF8 ( const void * _localPtr, size_t localLen, std::string * utf8 )
364 {
365 	const XMP_Uns8* localPtr = (XMP_Uns8*)_localPtr;
366 
367 	utf8->erase();
368 
369 	if ( ReconcileUtils::IsASCII ( localPtr, localLen ) ) {
370 		utf8->assign ( (const char *)localPtr, localLen );
371 		return;
372 	}
373 
374 	#if XMP_WinBuild
375 
376 		WinEncodingToUTF8 ( CP_ACP, localPtr, localLen, utf8 );
377 
378 	#elif XMP_MacBuild
379 
380 		MacEncodingToUTF8 ( smSystemScript, kTextLanguageDontCare, localPtr, localLen, utf8 );
381 
382 	#elif XMP_UNIXBuild
383 
384 		XMP_Throw ( "Generic UNIX does not have conversions between local and Unicode", kXMPErr_Unavailable );
385 
386     #elif XMP_iOSBuild
387 
388         IOSConvertEncoding(CFStringGetSystemEncoding(), kCFStringEncodingUTF8, localPtr, localLen, utf8);
389 
390 
391 	#endif
392 
393 }	// ReconcileUtils::LocalToUTF8
394 
395 // =================================================================================================
396 // ReconcileUtils::Latin1ToUTF8
397 // ============================
398 
Latin1ToUTF8(const void * _latin1Ptr,size_t latin1Len,std::string * utf8)399 void ReconcileUtils::Latin1ToUTF8 ( const void * _latin1Ptr, size_t latin1Len, std::string * utf8 )
400 {
401 	const XMP_Uns8* latin1Ptr = (XMP_Uns8*)_latin1Ptr;
402 	const XMP_Uns8* latin1End = latin1Ptr + latin1Len;
403 
404 	utf8->erase();
405 	utf8->reserve ( latin1Len );	// As good a guess as any, exact for ASCII.
406 
407 	for ( ; latin1Ptr < latin1End; ++latin1Ptr ) {
408 
409 		XMP_Uns8 ch8 = *latin1Ptr;
410 
411 		if ( ch8 <= 0x7F ) {
412 			(*utf8) += (char)ch8;	// Have an ASCII character.
413 		} else if ( ch8 <= 0xBF ) {
414 			(*utf8) += 0xC2;	// Latin-1 80..BF are UTF-8 C280..C2BF.
415 			(*utf8) += (char)ch8;
416 		} else {
417 			(*utf8) += 0xC3;	// Latin-1 C0..FF are UTF-8 C380..C3BF.
418 			(*utf8) += (char)(ch8 - 0x40);
419 		}
420 
421 	}
422 
423 }	// ReconcileUtils::Latin1ToUTF8
424 
425 
426 // =================================================================================================
427 // ReconcileUtils::NativeToUTF8
428 // ============================
429 
NativeToUTF8(const std::string & input,std::string & output)430 void ReconcileUtils::NativeToUTF8( const std::string & input, std::string & output )
431 {
432 	output.erase();
433 	// IF it is not UTF-8
434 	if( ! ReconcileUtils::IsUTF8( input.c_str(), input.length() ) )
435 	{
436 		// And ServerMode is not active
437 		if( ! ignoreLocalText )
438 		{
439 			// Convert it to UTF-8
440 			ReconcileUtils::LocalToUTF8( input.c_str(), input.length(), &output );
441 		}
442 	}
443 	else // If it is already UTF-8
444 	{
445 		output = input;
446 	}
447 }	// ReconcileUtils::NativeToUTF8
448 
449 
450 #if XMP_iOSBuild
IOSConvertEncoding(XMP_Uns32 srcEncoding,XMP_Uns32 destEncoding,const XMP_Uns8 * inputPtr,size_t inputLen,std::string * output)451     void ReconcileUtils::IOSConvertEncoding(XMP_Uns32 srcEncoding, XMP_Uns32 destEncoding, const XMP_Uns8 * inputPtr, size_t inputLen, std::string * output)
452     {
453         if(srcEncoding == kCFStringEncodingInvalidId || destEncoding == kCFStringEncodingInvalidId ||
454                 !CFStringIsEncodingAvailable(srcEncoding) || !CFStringIsEncodingAvailable(destEncoding))
455             return;
456         CFStringRef cStrRef = CFStringCreateWithBytesNoCopy(NULL, inputPtr, inputLen, srcEncoding, false, kCFAllocatorNull);
457         if(cStrRef == NULL)
458             return;
459         CFRange inputRange = CFRangeMake(0, CFStringGetLength(cStrRef));
460         const size_t kBufferLen = 1000;
461         while(inputRange.length > 0)
462         {
463             XMP_Uns8 buffer[kBufferLen];
464             CFIndex charsWritten;
465             CFIndex charsProcessed = CFStringGetBytes(cStrRef, inputRange, destEncoding, 0, FALSE, buffer, kBufferLen, &charsWritten);
466             if (charsProcessed == 0) break;
467             output->append(reinterpret_cast<const char*>(&buffer[0]), charsWritten);
468             inputRange.location += charsProcessed;
469             inputRange.length -= charsProcessed;
470         }
471         CFRelease(cStrRef);
472     }
473 #endif
474