1 // =================================================================================================
2 // ADOBE SYSTEMS INCORPORATED
3 // Copyright 2006-2007 Adobe Systems Incorporated
4 // All Rights Reserved
5 //
6 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
7 // of the Adobe license agreement accompanying it.
8 // =================================================================================================
9 
10 #include "XMP_Environment.h"	// ! This must be the first include.
11 
12 #include "Reconcile_Impl.hpp"
13 
14 #include "UnicodeConversions.hpp"
15 
16 #if XMP_WinBuild
17 #elif XMP_MacBuild
18 	#include "UnicodeConverter.h"
19 #endif
20 
21 // =================================================================================================
22 /// \file Reconcile_Impl.cpp
23 /// \brief Implementation utilities for the legacy metadata reconciliation support.
24 ///
25 // =================================================================================================
26 
27 // =================================================================================================
28 // IsASCII
29 // =======
30 //
31 // See if a string is 7 bit ASCII.
32 
IsASCII(const void * strPtr,size_t strLen)33 static inline bool IsASCII ( const void * strPtr, size_t strLen )
34 {
35 
36 	for ( const XMP_Uns8 * strPos = (XMP_Uns8*)strPtr; strLen > 0; --strLen, ++strPos ) {
37 		if ( *strPos >= 0x80 ) return false;
38 	}
39 
40 	return true;
41 
42 }	// IsASCII
43 
44 // =================================================================================================
45 // ReconcileUtils::IsUTF8
46 // ======================
47 //
48 // See if a string contains valid UTF-8. Allow nul bytes, they can appear inside of multi-part Exif
49 // strings. We don't use CodePoint_from_UTF8_Multi in UnicodeConversions because it throws an
50 // exception for non-Unicode and we don't need to actually compute the code points.
51 
IsUTF8(const void * utf8Ptr,size_t utf8Len)52 bool ReconcileUtils::IsUTF8 ( const void * utf8Ptr, size_t utf8Len )
53 {
54 	const XMP_Uns8 * utf8Pos = (XMP_Uns8*)utf8Ptr;
55 	const XMP_Uns8 * utf8End = utf8Pos + utf8Len;
56 
57 	while ( utf8Pos < utf8End ) {
58 
59 		if ( *utf8Pos < 0x80 ) {
60 
61 			++utf8Pos;	// ASCII is UTF-8, tolerate nuls.
62 
63 		} else {
64 
65 			// -------------------------------------------------------------------------------------
66 			// We've got a multibyte UTF-8 character. The first byte has the number of bytes as the
67 			// number of high order 1 bits. The remaining bytes must have 1 and 0 as the top 2 bits.
68 
69 			#if 0	// *** This might be a more effcient way to count the bytes.
70 				static XMP_Uns8 kByteCounts[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
71 				size_t bytesNeeded = kByteCounts [ *utf8Pos >> 4 ];
72 				if ( (bytesNeeded < 2) || ((bytesNeeded == 4) && ((*utf8Pos & 0x08) != 0)) ) return false;
73 				if ( (utf8Pos + bytesNeeded) > utf8End ) return false;
74 			#endif
75 
76 			size_t bytesNeeded = 0;	// Count the high order 1 bits in the first byte.
77 			for ( XMP_Uns8 temp = *utf8Pos; temp > 0x7F; temp = temp << 1 ) ++bytesNeeded;
78 				// *** Consider CPU-specific assembly inline, e.g. cntlzw on PowerPC.
79 
80 			if ( (bytesNeeded < 2) || (bytesNeeded > 4) || ((utf8Pos+bytesNeeded) > utf8End) ) return false;
81 
82 			for ( --bytesNeeded, ++utf8Pos; bytesNeeded > 0; --bytesNeeded, ++utf8Pos ) {
83 				if ( (*utf8Pos >> 6) != 2 ) return false;
84 			}
85 
86 		}
87 
88 	}
89 
90 	return true;
91 
92 }	// ReconcileUtils::IsUTF8
93 
94 // =================================================================================================
95 // UTF8ToHostEncoding
96 // ==================
97 
98 #if XMP_WinBuild
99 
UTF8ToWinEncoding(UINT codePage,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)100 	static void UTF8ToWinEncoding ( UINT codePage,
101 									const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
102 	{
103 
104 		std::string utf16;	// WideCharToMultiByte wants native UTF-16.
105 		ToUTF16Native ( (UTF8Unit*)utf8Ptr, utf8Len, &utf16 );
106 
107 		LPCWSTR utf16Ptr = (LPCWSTR) utf16.c_str();
108 		size_t  utf16Len = utf16.size() / 2;
109 
110 		int hostLen = WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, 0, 0, 0, 0 );
111 		host->assign ( hostLen, ' ' );	// Allocate space for the results.
112 
113 		(void) WideCharToMultiByte ( codePage, 0, utf16Ptr, (int)utf16Len, (LPSTR)host->data(), hostLen, 0, 0 );
114 		XMP_Assert ( hostLen == host->size() );
115 
116 	}	// UTF8ToWinEncoding
117 
118 #elif XMP_MacBuild
119 
UTF8ToMacEncoding(TextEncoding & destEncoding,const XMP_Uns8 * utf8Ptr,size_t utf8Len,std::string * host)120 	static void UTF8ToMacEncoding ( TextEncoding & destEncoding,
121 									const XMP_Uns8 * utf8Ptr, size_t utf8Len, std::string * host )
122 	{
123 		OSStatus err;
124 
125 		UnicodeMapping mappingInfo;
126 		mappingInfo.mappingVersion  = kUnicodeUseLatestMapping;
127 		mappingInfo.otherEncoding   = GetTextEncodingBase ( destEncoding );
128 		mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
129 														   kUnicodeNoSubset, kUnicodeUTF8Format );
130 
131 		UnicodeToTextInfo converterInfo;
132 		err = CreateUnicodeToTextInfo ( &mappingInfo, &converterInfo );
133 		if ( err != noErr ) XMP_Throw ( "CreateUnicodeToTextInfo failed", kXMPErr_ExternalFailure );
134 
135 		try {	// ! Need to call DisposeUnicodeToTextInfo before exiting.
136 
137 			OptionBits convFlags = kUnicodeUseFallbacksMask |
138 								   kUnicodeLooseMappingsMask |  kUnicodeDefaultDirectionMask;
139 			ByteCount bytesRead, bytesWritten;
140 
141 			enum { kBufferLen = 1000 };	// Ought to be enough in practice, without using too much stack.
142 			char buffer [kBufferLen];
143 
144 			host->reserve ( utf8Len );	// As good a guess as any.
145 
146 			while ( utf8Len > 0 ) {
147 				// Ignore all errors from ConvertFromUnicodeToText. It returns info like "output
148 				// buffer full" or "use substitution" as errors.
149 				err = ConvertFromUnicodeToText ( converterInfo, utf8Len, (UniChar*)utf8Ptr, convFlags,
150 												 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, buffer );
151 				if ( bytesRead == 0 ) break;	// Make sure forward progress happens.
152 				host->append ( &buffer[0], bytesWritten );
153 				utf8Ptr += bytesRead;
154 				utf8Len -= bytesRead;
155 			}
156 
157 			DisposeUnicodeToTextInfo ( &converterInfo );
158 
159 		} catch ( ... ) {
160 
161 			DisposeUnicodeToTextInfo ( &converterInfo );
162 			throw;
163 
164 		}
165 
166 	}	// UTF8ToMacEncoding
167 
168 #elif XMP_UNIXBuild
169 
170 	// ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
171 	// ! what notion of local encoding should be used for generic UNIX, especially in a server product.
172 
173 #endif
174 
175 // =================================================================================================
176 // ReconcileUtils::UTF8ToLocal
177 // ===========================
178 
179 #if ! XMP_UNIXBuild
180 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
181 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
182 
UTF8ToLocal(const void * _utf8Ptr,size_t utf8Len,std::string * local)183 void ReconcileUtils::UTF8ToLocal ( const void * _utf8Ptr, size_t utf8Len, std::string * local )
184 {
185 	const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
186 
187 	local->erase();
188 
189 	if ( IsASCII ( utf8Ptr, utf8Len ) ) {
190 		local->assign ( (const char *)utf8Ptr, utf8Len );
191 		return;
192 	}
193 
194 	#if XMP_WinBuild
195 
196 		UTF8ToWinEncoding ( CP_ACP, utf8Ptr, utf8Len, local );
197 
198 	#elif XMP_MacBuild
199 
200 		OSStatus err;
201 
202 		TextEncoding localEncoding;
203 		err = UpgradeScriptInfoToTextEncoding ( smSystemScript,
204 												kTextLanguageDontCare, kTextRegionDontCare, 0, &localEncoding );
205 		if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
206 
207 		UTF8ToMacEncoding ( localEncoding, utf8Ptr, utf8Len, local );
208 
209 	#elif XMP_UNIXBuild
210 
211 		#error "No generic UNIX implementation"
212 
213 	#endif
214 
215 }	// ReconcileUtils::UTF8ToLocal
216 
217 #endif
218 
219 // =================================================================================================
220 // ReconcileUtils::UTF8ToLatin1
221 // ============================
222 //
223 // Actually to the Windows code page 1252 superset of 8859-1.
224 
225 #if ! XMP_UNIXBuild
226 // ! Does not exist, must not be called, for Generic UNIX builds. At some point we could consider
227 // ! creating our own private implementation. So far only needed for the ID3 legacy in MP3 files.
228 
UTF8ToLatin1(const void * _utf8Ptr,size_t utf8Len,std::string * latin1)229 void ReconcileUtils::UTF8ToLatin1 ( const void * _utf8Ptr, size_t utf8Len, std::string * latin1 )
230 {
231 	const XMP_Uns8* utf8Ptr = (XMP_Uns8*)_utf8Ptr;
232 
233 	latin1->erase();
234 
235 	if ( IsASCII ( utf8Ptr, utf8Len ) ) {
236 		latin1->assign ( (const char *)utf8Ptr, utf8Len );
237 		return;
238 	}
239 
240 	#if XMP_WinBuild
241 
242 		UTF8ToWinEncoding ( 1252, utf8Ptr, utf8Len, latin1 );
243 
244 	#elif XMP_MacBuild
245 
246 		TextEncoding latin1Encoding;
247 		latin1Encoding = CreateTextEncoding ( kTextEncodingWindowsLatin1,
248 											  kTextEncodingDefaultVariant, kTextEncodingDefaultFormat );
249 
250 		UTF8ToMacEncoding ( latin1Encoding, utf8Ptr, utf8Len, latin1 );
251 
252 	#elif XMP_UNIXBuild
253 
254 		#error "No generic UNIX implementation"
255 
256 	#endif
257 
258 }	// ReconcileUtils::UTF8ToLatin1
259 
260 #endif
261 
262 // =================================================================================================
263 // HostEncodingToUTF8
264 // ==================
265 
266 #if XMP_WinBuild
267 
WinEncodingToUTF8(UINT codePage,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)268 	static void WinEncodingToUTF8 ( UINT codePage,
269 									const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
270 	{
271 
272 		int utf16Len = MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, 0, 0 );
273 		std::vector<UTF16Unit> utf16 ( utf16Len, 0 );	// MultiByteToWideChar returns native UTF-16.
274 
275 		(void) MultiByteToWideChar ( codePage, 0, (LPCSTR)hostPtr, (int)hostLen, (LPWSTR)&utf16[0], utf16Len );
276 		FromUTF16Native ( &utf16[0], (int)utf16Len, utf8 );
277 
278 	}	// WinEncodingToUTF8
279 
280 #elif XMP_MacBuild
281 
MacEncodingToUTF8(TextEncoding & srcEncoding,const XMP_Uns8 * hostPtr,size_t hostLen,std::string * utf8)282 	static void MacEncodingToUTF8 ( TextEncoding & srcEncoding,
283 									const XMP_Uns8 * hostPtr, size_t hostLen, std::string * utf8 )
284 	{
285 		OSStatus err;
286 
287 		UnicodeMapping mappingInfo;
288 		mappingInfo.mappingVersion  = kUnicodeUseLatestMapping;
289 		mappingInfo.otherEncoding   = GetTextEncodingBase ( srcEncoding );
290 		mappingInfo.unicodeEncoding = CreateTextEncoding ( kTextEncodingUnicodeDefault,
291 														   kUnicodeNoSubset, kUnicodeUTF8Format );
292 
293 		TextToUnicodeInfo converterInfo;
294 		err = CreateTextToUnicodeInfo ( &mappingInfo, &converterInfo );
295 		if ( err != noErr ) XMP_Throw ( "CreateTextToUnicodeInfo failed", kXMPErr_ExternalFailure );
296 
297 		try {	// ! Need to call DisposeTextToUnicodeInfo before exiting.
298 
299 			ByteCount bytesRead, bytesWritten;
300 
301 			enum { kBufferLen = 1000 };	// Ought to be enough in practice, without using too much stack.
302 			char buffer [kBufferLen];
303 
304 			utf8->reserve ( hostLen );	// As good a guess as any.
305 
306 			while ( hostLen > 0 ) {
307 				// Ignore all errors from ConvertFromTextToUnicode. It returns info like "output
308 				// buffer full" or "use substitution" as errors.
309 				err = ConvertFromTextToUnicode ( converterInfo, hostLen, hostPtr, kNilOptions,
310 												 0, 0, 0, 0, kBufferLen, &bytesRead, &bytesWritten, (UniChar*)buffer );
311 				if ( bytesRead == 0 ) break;	// Make sure forward progress happens.
312 				utf8->append ( &buffer[0], bytesWritten );
313 				hostPtr += bytesRead;
314 				hostLen -= bytesRead;
315 			}
316 
317 			DisposeTextToUnicodeInfo ( &converterInfo );
318 
319 		} catch ( ... ) {
320 
321 			DisposeTextToUnicodeInfo ( &converterInfo );
322 			throw;
323 
324 		}
325 
326 	}	// MacEncodingToUTF8
327 
328 #elif XMP_UNIXBuild
329 
330 	// ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
331 	// ! what notion of local encoding should be used for generic UNIX, especially in a server product.
332 
333 #endif
334 
335 // =================================================================================================
336 // ReconcileUtils::LocalToUTF8
337 // ===========================
338 
339 #if ! XMP_UNIXBuild
340 // ! Does not exist, must not be called, for Generic UNIX builds. It is not clear at this time
341 // ! what notion of local encoding should be used for generic UNIX, especially in a server product.
342 
LocalToUTF8(const void * _localPtr,size_t localLen,std::string * utf8)343 void ReconcileUtils::LocalToUTF8 ( const void * _localPtr, size_t localLen, std::string * utf8 )
344 {
345 	const XMP_Uns8* localPtr = (XMP_Uns8*)_localPtr;
346 
347 	utf8->erase();
348 
349 	if ( IsASCII ( localPtr, localLen ) ) {
350 		utf8->assign ( (const char *)localPtr, localLen );
351 		return;
352 	}
353 
354 	#if XMP_WinBuild
355 
356 		WinEncodingToUTF8 ( CP_ACP, localPtr, localLen, utf8 );
357 
358 	#elif XMP_MacBuild
359 
360 		OSStatus err;
361 
362 		TextEncoding localEncoding;
363 		err = UpgradeScriptInfoToTextEncoding ( smSystemScript, kTextLanguageDontCare, kTextRegionDontCare, 0, &localEncoding );
364 		if ( err != noErr ) XMP_Throw ( "UpgradeScriptInfoToTextEncoding failed", kXMPErr_ExternalFailure );
365 
366 		MacEncodingToUTF8 ( localEncoding, localPtr, localLen, utf8 );
367 
368 	#elif XMP_UNIXBuild
369 
370 		#error "No generic UNIX implementation"
371 
372 	#endif
373 
374 }	// ReconcileUtils::LocalToUTF8
375 
376 #endif
377 
378 // =================================================================================================
379 // ReconcileUtils::Latin1ToUTF8
380 // ============================
381 //
382 // Actually from the Windows code page 1252 superset of 8859-1.
383 
384 #if ! XMP_UNIXBuild
385 // ! Does not exist, must not be called, for Generic UNIX builds. At some point we could consider
386 // ! creating our own private implementation. So far only needed for the ID3 legacy in MP3 files.
387 
Latin1ToUTF8(const void * _latin1Ptr,size_t latin1Len,std::string * utf8)388 void ReconcileUtils::Latin1ToUTF8 ( const void * _latin1Ptr, size_t latin1Len, std::string * utf8 )
389 {
390 	const XMP_Uns8* latin1Ptr = (XMP_Uns8*)_latin1Ptr;
391 
392 	utf8->erase();
393 
394 	if ( IsASCII ( latin1Ptr, latin1Len ) ) {
395 		utf8->assign ( (const char *)latin1Ptr, latin1Len );
396 		return;
397 	}
398 
399 	#if XMP_WinBuild
400 
401 		WinEncodingToUTF8 ( 1252, latin1Ptr, latin1Len, utf8 );
402 
403 	#elif XMP_MacBuild
404 
405 		TextEncoding latin1Encoding;
406 		latin1Encoding = CreateTextEncoding ( kTextEncodingWindowsLatin1,
407 											  kTextEncodingDefaultVariant, kTextEncodingDefaultFormat );
408 
409 		MacEncodingToUTF8 ( latin1Encoding, latin1Ptr, latin1Len, utf8 );
410 
411 	#elif XMP_UNIXBuild
412 
413 		#error "No generic UNIX implementation"
414 
415 	#endif
416 
417 }	// ReconcileUtils::Latin1ToUTF8
418 
419 #endif
420