1 /*
2  * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
3  * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextCodecMac.h"
29 
30 #include "CharsetData.h"
31 #include "PlatformString.h"
32 #include "ThreadGlobalData.h"
33 #include <wtf/Assertions.h>
34 #include <wtf/PassOwnPtr.h>
35 #include <wtf/RetainPtr.h>
36 #include <wtf/Threading.h>
37 #include <wtf/text/CString.h>
38 #include <wtf/unicode/CharacterNames.h>
39 
40 using namespace std;
41 
42 namespace WebCore {
43 
// We need to keep this TEC-based codec because ICU doesn't support some of the encodings that we need:
// <http://bugs.webkit.org/show_bug.cgi?id=4195>.
46 
// Number of UniChar elements in the on-stack output buffer that the String-returning
// decode() hands to the converter on each pass.
static const size_t ConversionBufferSize = 16384;
48 
cachedConverterTEC()49 static TECConverterWrapper& cachedConverterTEC()
50 {
51     return threadGlobalData().cachedConverterTEC();
52 }
53 
registerEncodingNames(EncodingNameRegistrar registrar)54 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
55 {
56     TECTextEncodingID lastEncoding = invalidEncoding;
57     const char* lastName = 0;
58 
59     for (size_t i = 0; CharsetTable[i].name; ++i) {
60         if (CharsetTable[i].encoding != lastEncoding) {
61             lastEncoding = CharsetTable[i].encoding;
62             lastName = CharsetTable[i].name;
63         }
64         registrar(CharsetTable[i].name, lastName);
65     }
66 }
67 
newTextCodecMac(const TextEncoding &,const void * additionalData)68 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
69 {
70     return adoptPtr(new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData)));
71 }
72 
registerCodecs(TextCodecRegistrar registrar)73 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
74 {
75     TECTextEncodingID lastEncoding = invalidEncoding;
76 
77     for (size_t i = 0; CharsetTable[i].name; ++i)
78         if (CharsetTable[i].encoding != lastEncoding) {
79             registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
80             lastEncoding = CharsetTable[i].encoding;
81         }
82 }
83 
TextCodecMac(TECTextEncodingID encoding)84 TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
85     : m_encoding(encoding)
86     , m_numBufferedBytes(0)
87     , m_converterTEC(0)
88 {
89 }
90 
~TextCodecMac()91 TextCodecMac::~TextCodecMac()
92 {
93     releaseTECConverter();
94 }
95 
releaseTECConverter() const96 void TextCodecMac::releaseTECConverter() const
97 {
98     if (m_converterTEC) {
99         TECConverterWrapper& cachedConverter = cachedConverterTEC();
100         if (cachedConverter.converter)
101             TECDisposeConverter(cachedConverter.converter);
102         cachedConverter.converter = m_converterTEC;
103         cachedConverter.encoding = m_encoding;
104         m_converterTEC = 0;
105     }
106 }
107 
createTECConverter() const108 OSStatus TextCodecMac::createTECConverter() const
109 {
110     TECConverterWrapper& cachedConverter = cachedConverterTEC();
111 
112     bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
113     cachedConverter.encoding = invalidEncoding;
114 
115     if (cachedEncodingEqual && cachedConverter.converter) {
116         m_converterTEC = cachedConverter.converter;
117         cachedConverter.converter = 0;
118 
119         TECClearConverterContextInfo(m_converterTEC);
120     } else {
121         OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
122             CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
123         if (status)
124             return status;
125 
126         TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
127     }
128 
129     return noErr;
130 }
131 
decode(const unsigned char * inputBuffer,int inputBufferLength,int & inputLength,void * outputBuffer,int outputBufferLength,int & outputLength)132 OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
133     void *outputBuffer, int outputBufferLength, int& outputLength)
134 {
135     OSStatus status;
136     unsigned long bytesRead = 0;
137     unsigned long bytesWritten = 0;
138 
139     if (m_numBufferedBytes != 0) {
140         // Finish converting a partial character that's in our buffer.
141 
142         // First, fill the partial character buffer with as many bytes as are available.
143         ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
144         const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
145         const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
146         ASSERT(bytesToPutInBuffer != 0);
147         memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);
148 
149         // Now, do a conversion on the buffer.
150         status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
151             reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
152         ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);
153 
154         if (status == kTECPartialCharErr && bytesRead == 0) {
155             // Handle the case where the partial character was not converted.
156             if (bytesToPutInBuffer >= spaceInBuffer) {
157                 LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
158                 m_numBufferedBytes = 0;
159                 status = kTECUnmappableElementErr; // should never happen, but use this error code
160             } else {
161                 // Tell the caller we read all the source bytes and keep them in the buffer.
162                 m_numBufferedBytes += bytesToPutInBuffer;
163                 bytesRead = bytesToPutInBuffer;
164                 status = noErr;
165             }
166         } else {
167             // We are done with the partial character buffer.
168             // Also, we have read some of the bytes from the main buffer.
169             if (bytesRead > m_numBufferedBytes) {
170                 bytesRead -= m_numBufferedBytes;
171             } else {
172                 LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
173                 bytesRead = 0;
174             }
175             m_numBufferedBytes = 0;
176             if (status == kTECPartialCharErr) {
177                 // While there may be a partial character problem in the small buffer,
178                 // we have to try again and not get confused and think there is a partial
179                 // character problem in the large buffer.
180                 status = noErr;
181             }
182         }
183     } else {
184         status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
185             static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
186         ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
187     }
188 
189     // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
190     if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
191         status = kTECOutputBufferFullStatus;
192 
193     inputLength = bytesRead;
194     outputLength = bytesWritten;
195     return status;
196 }
197 
decode(const char * bytes,size_t length,bool flush,bool stopOnError,bool & sawError)198 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
199 {
200     // Get a converter for the passed-in encoding.
201     if (!m_converterTEC && createTECConverter() != noErr)
202         return String();
203 
204     Vector<UChar> result;
205 
206     const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
207     int sourceLength = length;
208     bool bufferWasFull = false;
209     UniChar buffer[ConversionBufferSize];
210 
211     while ((sourceLength || bufferWasFull) && !sawError) {
212         int bytesRead = 0;
213         int bytesWritten = 0;
214         OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
215         ASSERT(bytesRead <= sourceLength);
216         sourcePointer += bytesRead;
217         sourceLength -= bytesRead;
218 
219         switch (status) {
220             case noErr:
221             case kTECOutputBufferFullStatus:
222                 break;
223             case kTextMalformedInputErr:
224             case kTextUndefinedElementErr:
225                 // FIXME: Put FFFD character into the output string in this case?
226                 TECClearConverterContextInfo(m_converterTEC);
227                 if (stopOnError) {
228                     sawError = true;
229                     break;
230                 }
231                 if (sourceLength) {
232                     sourcePointer += 1;
233                     sourceLength -= 1;
234                 }
235                 break;
236             case kTECPartialCharErr: {
237                 // Put the partial character into the buffer.
238                 ASSERT(m_numBufferedBytes == 0);
239                 const int bufferSize = sizeof(m_numBufferedBytes);
240                 if (sourceLength < bufferSize) {
241                     memcpy(m_bufferedBytes, sourcePointer, sourceLength);
242                     m_numBufferedBytes = sourceLength;
243                 } else {
244                     LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
245                 }
246                 sourceLength = 0;
247                 break;
248             }
249             default:
250                 sawError = true;
251                 return String();
252         }
253 
254         ASSERT(!(bytesWritten % sizeof(UChar)));
255         result.append(buffer, bytesWritten / sizeof(UChar));
256 
257         bufferWasFull = status == kTECOutputBufferFullStatus;
258     }
259 
260     if (flush) {
261         unsigned long bytesWritten = 0;
262         TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
263         ASSERT(!(bytesWritten % sizeof(UChar)));
264         result.append(buffer, bytesWritten / sizeof(UChar));
265     }
266 
267     String resultString = String::adopt(result);
268 
269     // <rdar://problem/3225472>
270     // Simplified Chinese pages use the code A3A0 to mean "full-width space".
271     // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
272     // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
273     if (m_encoding == kCFStringEncodingGB_18030_2000)
274         resultString.replace(0xE5E5, ideographicSpace);
275 
276     return resultString;
277 }
278 
encode(const UChar * characters,size_t length,UnencodableHandling handling)279 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
280 {
281     // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
282 
283     // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
284     // Encoding will change the yen sign back into a backslash.
285     String copy(characters, length);
286     copy.replace('\\', m_backslashAsCurrencySymbol);
287     RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString());
288 
289     CFIndex startPos = 0;
290     CFIndex charactersLeft = CFStringGetLength(cfs.get());
291     Vector<char> result;
292     size_t size = 0;
293     UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
294     while (charactersLeft > 0) {
295         CFRange range = CFRangeMake(startPos, charactersLeft);
296         CFIndex bufferLength;
297         CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
298 
299         result.grow(size + bufferLength);
300         unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
301         CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
302         size += bufferLength;
303 
304         if (charactersConverted != charactersLeft) {
305             unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
306             ++charactersConverted;
307             if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
308                 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
309                 if ((low & 0xFC00) == 0xDC00) { // is low surrogate
310                     badChar <<= 10;
311                     badChar += low;
312                     badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
313                     ++charactersConverted;
314                 }
315             }
316             UnencodableReplacementArray entity;
317             int entityLength = getUnencodableReplacement(badChar, handling, entity);
318             result.grow(size + entityLength);
319             memcpy(result.data() + size, entity, entityLength);
320             size += entityLength;
321         }
322 
323         startPos += charactersConverted;
324         charactersLeft -= charactersConverted;
325     }
326     return CString(result.data(), size);
327 }
328 
329 } // namespace WebCore
330