1 /*
2  * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions are
6  * met:
7  *
8  *     * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  *     * Redistributions in binary form must reproduce the above
11  * copyright notice, this list of conditions and the following disclaimer
12  * in the documentation and/or other materials provided with the
13  * distribution.
14  *     * Neither the name of Google Inc. nor the names of its
15  * contributors may be used to endorse or promote products derived from
16  * this software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #include "config.h"
32 #include "TextEncodingDetector.h"
33 
34 #include "TextEncoding.h"
35 #include <wtf/UnusedParam.h>
36 
37 #include "unicode/ucnv.h"
38 #include "unicode/ucsdet.h"
39 
40 namespace WebCore {
41 
detectTextEncoding(const char * data,size_t len,const char * hintEncodingName,TextEncoding * detectedEncoding)42 bool detectTextEncoding(const char* data, size_t len,
43                         const char* hintEncodingName,
44                         TextEncoding* detectedEncoding)
45 {
46     *detectedEncoding = TextEncoding();
47     int matchesCount = 0;
48     UErrorCode status = U_ZERO_ERROR;
49     UCharsetDetector* detector = ucsdet_open(&status);
50     if (U_FAILURE(status))
51         return false;
52     ucsdet_enableInputFilter(detector, true);
53     ucsdet_setText(detector, data, static_cast<int32_t>(len), &status);
54     if (U_FAILURE(status))
55         return false;
56 
57     // FIXME: A few things we can do other than improving
58     // the ICU detector itself.
59     // 1. Use ucsdet_detectAll and pick the most likely one given
60     // "the context" (parent-encoding, referrer encoding, etc).
61     // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
62     // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
63     // encoding with a highest confidence among the detector-specific
64     // limited set of candidate encodings.
65     // Below is a partial implementation of the first part of what's outlined
66     // above.
67     const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
68     if (U_FAILURE(status)) {
69         ucsdet_close(detector);
70         return false;
71     }
72 
73     const char* encoding = 0;
74     if (hintEncodingName) {
75         TextEncoding hintEncoding(hintEncodingName);
76         // 10 is the minimum confidence value consistent with the codepoint
77         // allocation in a given encoding. The size of a chunk passed to
78         // us varies even for the same html file (apparently depending on
79         // the network load). When we're given a rather short chunk, we
80         // don't have a sufficiently reliable signal other than the fact that
81         // the chunk is consistent with a set of encodings. So, instead of
82         // setting an arbitrary threshold, we have to scan all the encodings
83         // consistent with the data.
84         const int32_t kThresold = 10;
85         for (int i = 0; i < matchesCount; ++i) {
86             int32_t confidence = ucsdet_getConfidence(matches[i], &status);
87             if (U_FAILURE(status)) {
88                 status = U_ZERO_ERROR;
89                 continue;
90             }
91             if (confidence < kThresold)
92                 break;
93             const char* matchEncoding = ucsdet_getName(matches[i], &status);
94             if (U_FAILURE(status)) {
95                 status = U_ZERO_ERROR;
96                 continue;
97             }
98             if (TextEncoding(matchEncoding) == hintEncoding) {
99                 encoding = hintEncodingName;
100                 break;
101             }
102         }
103     }
104     // If no match is found so far, just pick the top match.
105     // This can happen, say, when a parent frame in EUC-JP refers to
106     // a child frame in Shift_JIS and both frames do NOT specify the encoding
107     // making us resort to auto-detection (when it IS turned on).
108     if (!encoding && matchesCount > 0)
109         encoding = ucsdet_getName(matches[0], &status);
110     if (U_SUCCESS(status)) {
111         *detectedEncoding = TextEncoding(encoding);
112         ucsdet_close(detector);
113         return true;
114     }
115     ucsdet_close(detector);
116     return false;
117 }
118 
119 }
120