1 /*
2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextEncodingRegistry.h"
29 
30 #include "TextCodecLatin1.h"
31 #include "TextCodecUserDefined.h"
32 #include "TextCodecUTF16.h"
33 #include "TextCodecUTF8.h"
34 #include "TextEncoding.h"
35 #include <wtf/ASCIICType.h>
36 #include <wtf/HashMap.h>
37 #include <wtf/HashSet.h>
38 #include <wtf/StdLibExtras.h>
39 #include <wtf/StringExtras.h>
40 #include <wtf/Threading.h>
41 
42 #if USE(ICU_UNICODE)
43 #include "TextCodecICU.h"
44 #endif
45 #if PLATFORM(MAC)
46 #include "TextCodecMac.h"
47 #endif
48 #if PLATFORM(QT)
49 #include "qt/TextCodecQt.h"
50 #endif
51 #if USE(GLIB_UNICODE)
52 #include "gtk/TextCodecGtk.h"
53 #endif
54 #if USE(BREWMP_UNICODE)
55 #include "brew/TextCodecBrew.h"
56 #endif
57 #if OS(WINCE) && !PLATFORM(QT)
58 #include "TextCodecWinCE.h"
59 #endif
60 
61 #include <wtf/CurrentTime.h>
62 #include <wtf/text/CString.h>
63 
64 using namespace WTF;
65 
66 namespace WebCore {
67 
// Longest encoding name or alias, in characters and not counting the terminating NUL, that the
// registry works with: addToTextEncodingNameMap() asserts that aliases fit within it, and the
// UChar overload of atomicCanonicalTextEncodingName() sizes its conversion buffer from it.
68 const size_t maxEncodingNameLength = 63;
69 
70 // Hash for all-ASCII strings that does case folding.
71 struct TextEncodingNameHash {
equalWebCore::TextEncodingNameHash72     static bool equal(const char* s1, const char* s2)
73     {
74         char c1;
75         char c2;
76         do {
77             c1 = *s1++;
78             c2 = *s2++;
79             if (toASCIILower(c1) != toASCIILower(c2))
80                 return false;
81         } while (c1 && c2);
82         return !c1 && !c2;
83     }
84 
85     // This algorithm is the one-at-a-time hash from:
86     // http://burtleburtle.net/bob/hash/hashfaq.html
87     // http://burtleburtle.net/bob/hash/doobs.html
hashWebCore::TextEncodingNameHash88     static unsigned hash(const char* s)
89     {
90         unsigned h = WTF::stringHashingStartValue;
91         for (;;) {
92             char c = *s++;
93             if (!c) {
94                 h += (h << 3);
95                 h ^= (h >> 11);
96                 h += (h << 15);
97                 return h;
98             }
99             h += toASCIILower(c);
100             h += (h << 10);
101             h ^= (h >> 6);
102         }
103     }
104 
105     static const bool safeToCompareToEmptyOrDeleted = false;
106 };
107 
108 struct TextCodecFactory {
109     NewTextCodecFunction function;
110     const void* additionalData;
TextCodecFactoryWebCore::TextCodecFactory111     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
112 };
113 
// Maps an encoding name or alias to its canonical ("atomic") name. Keys are hashed and
// compared with ASCII case folding through TextEncodingNameHash.
114 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps a canonical name to the factory that creates its codec. No custom hash is supplied,
// so keys are presumably matched by pointer identity -- TODO confirm against WTF's default
// hash for const char*.
115 typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
116 
// Returns the mutex that newTextCodec(), atomicCanonicalTextEncodingName() and
// dumpTextEncodingNameMap() lock while they access the registry maps.
encodingRegistryMutex()117 static Mutex& encodingRegistryMutex()
118 {
119     // We don't have to use AtomicallyInitializedStatic here because
120     // this function is called on the main thread for any page before
121     // it is used in worker threads.
122     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
123     return mutex;
124 }
125 
// Name/alias -> canonical name. Allocated by buildBaseTextCodecMaps().
126 static TextEncodingNameMap* textEncodingNameMap;
// Canonical name -> codec factory. Allocated by buildBaseTextCodecMaps().
127 static TextCodecMap* textCodecMap;
// Set by atomicCanonicalTextEncodingName() once extendTextCodecMaps() has run.
128 static bool didExtendTextCodecMaps;
// Canonical names answered by isJapaneseEncoding(); filled in by buildQuirksSets().
129 static HashSet<const char*>* japaneseEncodings;
// Canonical names answered by shouldShowBackslashAsCurrencySymbolIn(); filled in by buildQuirksSets().
130 static HashSet<const char*>* nonBackslashEncodings;
131 
// Encodings that pruneBlacklistedCodecs() removes, together with all of their aliases.
132 static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
133 
134 #if ERROR_DISABLED
135 
checkExistingName(const char *,const char *)136 static inline void checkExistingName(const char*, const char*) { }
137 
138 #else
139 
checkExistingName(const char * alias,const char * atomicName)140 static void checkExistingName(const char* alias, const char* atomicName)
141 {
142     const char* oldAtomicName = textEncodingNameMap->get(alias);
143     if (!oldAtomicName)
144         return;
145     if (oldAtomicName == atomicName)
146         return;
147     // Keep the warning silent about one case where we know this will happen.
148     if (strcmp(alias, "ISO-8859-8-I") == 0
149             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
150             && strcasecmp(atomicName, "iso-8859-8") == 0)
151         return;
152     LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
153 }
154 
155 #endif
156 
// Returns true for aliases that must not be added to the name map.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    // Those are recognised by the comma they contain.
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
170 
addToTextEncodingNameMap(const char * alias,const char * name)171 static void addToTextEncodingNameMap(const char* alias, const char* name)
172 {
173     ASSERT(strlen(alias) <= maxEncodingNameLength);
174     if (isUndesiredAlias(alias))
175         return;
176     const char* atomicName = textEncodingNameMap->get(name);
177     ASSERT(strcmp(alias, name) == 0 || atomicName);
178     if (!atomicName)
179         atomicName = name;
180     checkExistingName(alias, atomicName);
181     textEncodingNameMap->add(alias, atomicName);
182 }
183 
addToTextCodecMap(const char * name,NewTextCodecFunction function,const void * additionalData)184 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
185 {
186     const char* atomicName = textEncodingNameMap->get(name);
187     ASSERT(atomicName);
188     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
189 }
190 
pruneBlacklistedCodecs()191 static void pruneBlacklistedCodecs()
192 {
193     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
194         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
195         if (!atomicName)
196             continue;
197 
198         Vector<const char*> names;
199         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
200         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
201         for (; it != end; ++it) {
202             if (it->second == atomicName)
203                 names.append(it->first);
204         }
205 
206         size_t length = names.size();
207         for (size_t j = 0; j < length; ++j)
208             textEncodingNameMap->remove(names[j]);
209 
210         textCodecMap->remove(atomicName);
211     }
212 }
213 
// Allocates the two registry maps and registers the codecs that are always compiled in
// (Latin-1, UTF-8, UTF-16 and user-defined). Asserts that it runs on the main thread and
// that the maps have not been created before.
buildBaseTextCodecMaps()214 static void buildBaseTextCodecMaps()
215 {
216     ASSERT(isMainThread());
217     ASSERT(!textCodecMap);
218     ASSERT(!textEncodingNameMap);
219 
220     textCodecMap = new TextCodecMap;
221     textEncodingNameMap = new TextEncodingNameMap;
222 
    // Each codec registers its names before its factories: addToTextCodecMap() asserts
    // that the name it is given is already present in textEncodingNameMap.
223     TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
224     TextCodecLatin1::registerCodecs(addToTextCodecMap);
225 
226     TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
227     TextCodecUTF8::registerCodecs(addToTextCodecMap);
228 
229     TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
230     TextCodecUTF16::registerCodecs(addToTextCodecMap);
231 
232     TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
233     TextCodecUserDefined::registerCodecs(addToTextCodecMap);
234 
235 #if USE(GLIB_UNICODE)
236     // FIXME: This is not needed. The code above covers all the base codecs.
237     TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
238     TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
239 #endif
240 }
241 
addEncodingName(HashSet<const char * > * set,const char * name)242 static void addEncodingName(HashSet<const char*>* set, const char* name)
243 {
244     // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
245     const char* atomicName = textEncodingNameMap->get(name);
246     if (atomicName)
247         set->add(atomicName);
248 }
249 
buildQuirksSets()250 static void buildQuirksSets()
251 {
252     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
253     // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
254 
255     ASSERT(!japaneseEncodings);
256     ASSERT(!nonBackslashEncodings);
257 
258     japaneseEncodings = new HashSet<const char*>;
259     addEncodingName(japaneseEncodings, "EUC-JP");
260     addEncodingName(japaneseEncodings, "ISO-2022-JP");
261     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
262     addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
263     addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
264     addEncodingName(japaneseEncodings, "JIS_C6226-1978");
265     addEncodingName(japaneseEncodings, "JIS_X0201");
266     addEncodingName(japaneseEncodings, "JIS_X0208-1983");
267     addEncodingName(japaneseEncodings, "JIS_X0208-1990");
268     addEncodingName(japaneseEncodings, "JIS_X0212-1990");
269     addEncodingName(japaneseEncodings, "Shift_JIS");
270     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
271     addEncodingName(japaneseEncodings, "cp932");
272     addEncodingName(japaneseEncodings, "x-mac-japanese");
273 
274     nonBackslashEncodings = new HashSet<const char*>;
275     // The text encodings below treat backslash as a currency symbol for IE compatibility.
276     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
277     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
278     addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
279     addEncodingName(nonBackslashEncodings, "EUC-JP");
280     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
281     addEncodingName(nonBackslashEncodings, "Shift_JIS");
282     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
283 }
284 
isJapaneseEncoding(const char * canonicalEncodingName)285 bool isJapaneseEncoding(const char* canonicalEncodingName)
286 {
287     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
288 }
289 
shouldShowBackslashAsCurrencySymbolIn(const char * canonicalEncodingName)290 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
291 {
292     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
293 }
294 
// Registers the names and codecs of whichever platform back-ends are compiled in, then
// removes the blacklisted encodings and builds the quirk sets. It is called from
// atomicCanonicalTextEncodingName(const char*), with the registry mutex held, the first
// time a name is not found in the maps.
extendTextCodecMaps()295 static void extendTextCodecMaps()
296 {
297 #if USE(ICU_UNICODE)
298     TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
299     TextCodecICU::registerCodecs(addToTextCodecMap);
300 #endif
301 
302 #if USE(QT4_UNICODE)
303     TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
304     TextCodecQt::registerCodecs(addToTextCodecMap);
305 #endif
306 
307 #if PLATFORM(MAC)
308     TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
309     TextCodecMac::registerCodecs(addToTextCodecMap);
310 #endif
311 
312 #if USE(GLIB_UNICODE)
313     TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
314     TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
315 #endif
316 
317 #if OS(WINCE) && !PLATFORM(QT)
318     TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap);
319     TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap);
320 #endif
321 
    // Pruning runs after every back-end has registered, so blacklisted names added by any
    // of them are removed. buildQuirksSets() resolves its names through textEncodingNameMap
    // and therefore runs last.
322     pruneBlacklistedCodecs();
323     buildQuirksSets();
324 }
325 
newTextCodec(const TextEncoding & encoding)326 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
327 {
328     MutexLocker lock(encodingRegistryMutex());
329 
330     ASSERT(textCodecMap);
331     TextCodecFactory factory = textCodecMap->get(encoding.name());
332     ASSERT(factory.function);
333     return factory.function(encoding, factory.additionalData);
334 }
335 
atomicCanonicalTextEncodingName(const char * name)336 const char* atomicCanonicalTextEncodingName(const char* name)
337 {
338     if (!name || !name[0])
339         return 0;
340     if (!textEncodingNameMap)
341         buildBaseTextCodecMaps();
342 
343     MutexLocker lock(encodingRegistryMutex());
344 
345     if (const char* atomicName = textEncodingNameMap->get(name))
346         return atomicName;
347     if (didExtendTextCodecMaps)
348         return 0;
349     extendTextCodecMaps();
350     didExtendTextCodecMaps = true;
351     return textEncodingNameMap->get(name);
352 }
353 
atomicCanonicalTextEncodingName(const UChar * characters,size_t length)354 const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length)
355 {
356     char buffer[maxEncodingNameLength + 1];
357     size_t j = 0;
358     for (size_t i = 0; i < length; ++i) {
359         UChar c = characters[i];
360         if (j == maxEncodingNameLength)
361             return 0;
362         buffer[j++] = c;
363     }
364     buffer[j] = 0;
365     return atomicCanonicalTextEncodingName(buffer);
366 }
367 
noExtendedTextEncodingNameUsed()368 bool noExtendedTextEncodingNameUsed()
369 {
370     // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
371     return !didExtendTextCodecMaps;
372 }
373 
374 #ifndef NDEBUG
dumpTextEncodingNameMap()375 void dumpTextEncodingNameMap()
376 {
377     unsigned size = textEncodingNameMap->size();
378     fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
379 
380     MutexLocker lock(encodingRegistryMutex());
381 
382     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
383     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
384     for (; it != end; ++it)
385         fprintf(stderr, "'%s' => '%s'\n", it->first, it->second);
386 }
387 #endif
388 
389 } // namespace WebCore
390