1 /*
2     Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4     Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 
6     This library is free software; you can redistribute it and/or
7     modify it under the terms of the GNU Library General Public
8     License as published by the Free Software Foundation; either
9     version 2 of the License, or (at your option) any later version.
10 
11     This library is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14     Library General Public License for more details.
15 
16     You should have received a copy of the GNU Library General Public License
17     along with this library; see the file COPYING.LIB.  If not, write to
18     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19     Boston, MA 02110-1301, USA.
20 */
21 
22 
23 #include "config.h"
24 #include "TextResourceDecoder.h"
25 
26 #include "DOMImplementation.h"
27 #include "HTMLMetaCharsetParser.h"
28 #include "HTMLNames.h"
29 #include "TextCodec.h"
30 #include "TextEncoding.h"
31 #include "TextEncodingDetector.h"
32 #include "TextEncodingRegistry.h"
33 #include <wtf/ASCIICType.h>
34 #include <wtf/StringExtras.h>
35 
36 using namespace WTF;
37 
38 namespace WebCore {
39 
40 using namespace HTMLNames;
41 
42 // You might think we should put these find functions elsewhere, perhaps with the
43 // similar functions that operate on UChar, but arguably only the decoder has
44 // a reason to process strings of char rather than UChar.
45 
// Returns the byte offset of the first occurrence of the NUL-terminated |target|
// within the first |subjectLength| bytes of |subject|, or -1 if there is none.
// |subject| does not have to be NUL-terminated; an empty |target| matches at 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    const size_t needleLength = strlen(target);
    if (needleLength > subjectLength)
        return -1;

    // Last offset at which the whole target still fits inside the subject.
    const size_t lastStart = subjectLength - needleLength;
    for (size_t start = 0; start <= lastStart; ++start) {
        if (!memcmp(subject + start, target, needleLength))
            return start;
    }
    return -1;
}
64 
findTextEncoding(const char * encodingName,int length)65 static TextEncoding findTextEncoding(const char* encodingName, int length)
66 {
67     Vector<char, 64> buffer(length + 1);
68     memcpy(buffer.data(), encodingName, length);
69     buffer[length] = '\0';
70     return buffer.data();
71 }
72 
// Heuristic guesser for the Japanese encoding (ISO-2022-JP, EUC-JP or
// Shift_JIS) of a raw byte sequence. All members are static.
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };

    // Examines the |length| bytes at |str| and returns the guessed encoding.
    // ASCII is returned when nothing in the data favours one encoding.
    // judge() itself never returns UTF16 or UTF8.
    static enum Type judge(const char* str, int length);

    static const int ESC = 0x1b;

    // Per-byte classification table used by ISkanji() and ISkana():
    //   bit 0 (value 1) is set for 0x81-0x9f and 0xe0-0xfc,
    //   bit 1 (value 2) is set for 0xa1-0xdf.
    static const unsigned char sjisMap[256];

    // Non-zero when |code| is in 0x81-0x9f or 0xe0-0xfc; 0 for any value >= 0x100.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }

    // Non-zero when |code| is in 0xa1-0xdf; 0 for any value >= 0x100.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_Jis is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_Jis Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Walks the buffer once. Some byte patterns are treated as conclusive and
// return immediately; others only add to the |sjis| / |euc| scores, which
// break the tie at the end if nothing conclusive was seen.
//
// Every look-ahead (ptr[i + 1], ptr[i + 2]) is guarded so that no byte at or
// beyond |size| is ever read.
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code = ASCII;
    bool bfr = false;           /* Kana Moji: previous byte was an undecided high byte */
    int bfk = 0;                /* EUC Kana: number of 0x8e-prefixed kana candidates seen in a row */
    int sjis = 0;               /* score in favour of Shift_JIS */
    int euc = 0;                /* score in favour of EUC-JP */

    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    int i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            const unsigned char first = ptr[i + 1];
            const unsigned char second = ptr[i + 2];
            /* ESC $ B, ESC $ @, ESC ( B and ESC ( J are conclusive for JIS. */
            if ((first == '$' && (second == 'B' || second == '@'))
                || (first == '(' && (second == 'B' || second == 'J'))) {
                return JIS;
            }
            if ((first == '(' || first == ')') && second == 'I') {
                /* ESC ( I and ESC ) I: remember JIS but keep scanning. */
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* A control byte: look back at the two bytes preceding it. */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                        && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                        && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* Check for hiragana or katakana lead bytes. */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                         && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }

                if (bfr) {
                    /* The previous byte was an undecided high byte; judge the pair. */
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        return SJIS;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f) && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        return SJIS;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe) && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe) && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        return SJIS;
                    } else if (ptr[i] <= 0x7f) {
                        return SJIS;
                    } else {
                        /* ptr[i] is above 0x7f here, so only high-byte ranges remain. */
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i > 1) {
                        if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                            /* EUC KANA or SJIS KANJI */
                            if (bfk == 1)
                                euc += 100;
                            bfk++;
                            i++;        /* also consume the byte after 0x8e */
                        } else {
                            /* SJIS only */
                            return SJIS;
                        }
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    /* Only peek at the trail byte when one actually exists. */
                    if ((size - i > 1)
                            && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        return code;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    /* Only peek at the trail byte when one actually exists. */
                    if ((size - i > 1)
                            && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        return code;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }

    /* Nothing conclusive was found: let the accumulated scores decide. */
    if (code == ASCII) {
        if (sjis > euc)
            code = SJIS;
        else if (sjis < euc)
            code = EUC;
    }
    return code;
}
281 
determineContentType(const String & mimeType)282 TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
283 {
284     if (equalIgnoringCase(mimeType, "text/css"))
285         return CSS;
286     if (equalIgnoringCase(mimeType, "text/html"))
287         return HTML;
288     if (DOMImplementation::isXMLMIMEType(mimeType))
289         return XML;
290     return PlainText;
291 }
292 
defaultEncoding(ContentType contentType,const TextEncoding & specifiedDefaultEncoding)293 const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
294 {
295     // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
296     // for text/xml. This matches Firefox.
297     if (contentType == XML)
298         return UTF8Encoding();
299     if (!specifiedDefaultEncoding.isValid())
300         return Latin1Encoding();
301     return specifiedDefaultEncoding;
302 }
303 
// Creates a decoder for a resource of the given MIME type.
// The content type is derived from |mimeType| via determineContentType(), and
// the starting encoding from defaultEncoding() with the encoding source marked
// as DefaultEncoding. Every "checked" flag starts out false, so no BOM or
// charset sniffing is considered done yet. |usesEncodingDetector| is stored
// and later consulted by shouldAutoDetect().
TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)
    : m_contentType(determineContentType(mimeType))
    , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) // reads m_contentType, which is initialized on the line above
    , m_source(DefaultEncoding)
    , m_hintEncoding(0)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForHeadCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
    , m_usesEncodingDetector(usesEncodingDetector)
{
}
317 
// The destructor body performs no explicit cleanup.
TextResourceDecoder::~TextResourceDecoder()
{
}
321 
setEncoding(const TextEncoding & encoding,EncodingSource source)322 void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
323 {
324     // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
325     if (!encoding.isValid())
326         return;
327 
328     // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
329     // treat x-user-defined as windows-1252 (bug 18270)
330     if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
331         m_encoding = "windows-1252";
332     else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
333         m_encoding = encoding.closestByteBasedEquivalent();
334     else
335         m_encoding = encoding;
336 
337     m_codec.clear();
338     m_source = source;
339 }
340 
341 // Returns the position of the encoding string.
findXMLEncoding(const char * str,int len,int & encodingLength)342 static int findXMLEncoding(const char* str, int len, int& encodingLength)
343 {
344     int pos = find(str, len, "encoding");
345     if (pos == -1)
346         return -1;
347     pos += 8;
348 
349     // Skip spaces and stray control characters.
350     while (pos < len && str[pos] <= ' ')
351         ++pos;
352 
353     // Skip equals sign.
354     if (pos >= len || str[pos] != '=')
355         return -1;
356     ++pos;
357 
358     // Skip spaces and stray control characters.
359     while (pos < len && str[pos] <= ' ')
360         ++pos;
361 
362     // Skip quotation mark.
363     if (pos >= len)
364         return - 1;
365     char quoteMark = str[pos];
366     if (quoteMark != '"' && quoteMark != '\'')
367         return -1;
368     ++pos;
369 
370     // Find the trailing quotation mark.
371     int end = pos;
372     while (end < len && str[end] != quoteMark)
373         ++end;
374     if (end >= len)
375         return -1;
376 
377     encodingLength = end - pos;
378     return pos;
379 }
380 
381 // true if there is more to parse
// Advances |pos| past any run of tabs and spaces, stopping at |dataEnd| at the
// latest. Other whitespace such as newlines is not skipped.
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    for (; pos < dataEnd; ++pos) {
        const char current = *pos;
        if (current != '\t' && current != ' ')
            break;
    }
    return pos != dataEnd;
}
388 
checkForBOM(const char * data,size_t len)389 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
390 {
391     // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
392     // We let it override even a user-chosen encoding.
393     ASSERT(!m_checkedForBOM);
394 
395     size_t lengthOfBOM = 0;
396 
397     size_t bufferLength = m_buffer.size();
398 
399     size_t buf1Len = bufferLength;
400     size_t buf2Len = len;
401     const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
402     const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
403     unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
404     unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
405     unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
406     unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
407 
408     // Check for the BOM.
409     if (c1 == 0xFF && c2 == 0xFE) {
410         if (c3 != 0 || c4 != 0) {
411             setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
412             lengthOfBOM = 2;
413         } else {
414             setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
415             lengthOfBOM = 4;
416         }
417     } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
418         setEncoding(UTF8Encoding(), AutoDetectedEncoding);
419         lengthOfBOM = 3;
420     } else if (c1 == 0xFE && c2 == 0xFF) {
421         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
422         lengthOfBOM = 2;
423     } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
424         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
425         lengthOfBOM = 4;
426     }
427 
428     if (lengthOfBOM || bufferLength + len >= 4)
429         m_checkedForBOM = true;
430 
431     return lengthOfBOM;
432 }
433 
checkForCSSCharset(const char * data,size_t len,bool & movedDataToBuffer)434 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
435 {
436     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
437         m_checkedForCSSCharset = true;
438         return true;
439     }
440 
441     size_t oldSize = m_buffer.size();
442     m_buffer.grow(oldSize + len);
443     memcpy(m_buffer.data() + oldSize, data, len);
444 
445     movedDataToBuffer = true;
446 
447     if (m_buffer.size() > 8) { // strlen("@charset") == 8
448         const char* dataStart = m_buffer.data();
449         const char* dataEnd = dataStart + m_buffer.size();
450 
451         if (dataStart[0] == '@' && dataStart[1] == 'c' && dataStart[2] == 'h' && dataStart[3] == 'a' && dataStart[4] == 'r' &&
452             dataStart[5] == 's' && dataStart[6] == 'e' && dataStart[7] == 't') {
453 
454             dataStart += 8;
455             const char* pos = dataStart;
456             if (!skipWhitespace(pos, dataEnd))
457                 return false;
458 
459             if (*pos == '"' || *pos == '\'') {
460                 char quotationMark = *pos;
461                 ++pos;
462                 dataStart = pos;
463 
464                 while (pos < dataEnd && *pos != quotationMark)
465                     ++pos;
466                 if (pos == dataEnd)
467                     return false;
468 
469                 int encodingNameLength = pos - dataStart;
470 
471                 ++pos;
472                 if (!skipWhitespace(pos, dataEnd))
473                     return false;
474 
475                 if (*pos == ';')
476                     setEncoding(findTextEncoding(dataStart, encodingNameLength), EncodingFromCSSCharset);
477             }
478         }
479         m_checkedForCSSCharset = true;
480         return true;
481     }
482     return false;
483 }
484 
485 // Other browsers allow comments in the head section, so we need to also.
486 // It's important not to look for tags inside the comments.
// |ptr| is expected to point just past the opening of a comment. On return it
// points just past the terminator if one was found. If none was found it is
// left within the last two bytes before |pEnd|, or unchanged when fewer than
// three bytes were available.
static inline void skipComment(const char*& ptr, const char* pEnd)
{
    const char* cursor = ptr;
    if (cursor == pEnd)
        return;

    // Allow <!-->; other browsers do.
    if (*cursor == '>') {
        ptr = cursor + 1;
        return;
    }

    // At least three bytes must remain for "-->" to fit.
    for (; pEnd - cursor > 2; ++cursor) {
        if (cursor[0] != '-' || cursor[1] != '-')
            continue;

        // This is the real end of comment, "-->".
        if (cursor[2] == '>') {
            cursor += 3;
            break;
        }
        // This is the incorrect end of comment that other browsers allow, "--!>".
        if (pEnd - cursor > 3 && cursor[2] == '!' && cursor[3] == '>') {
            cursor += 4;
            break;
        }
    }
    ptr = cursor;
}
514 
checkForHeadCharset(const char * data,size_t len,bool & movedDataToBuffer)515 bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
516 {
517     if (m_source != DefaultEncoding && m_source != EncodingFromParentFrame) {
518         m_checkedForHeadCharset = true;
519         return true;
520     }
521 
522     // This is not completely efficient, since the function might go
523     // through the HTML head several times.
524 
525     size_t oldSize = m_buffer.size();
526     m_buffer.grow(oldSize + len);
527     memcpy(m_buffer.data() + oldSize, data, len);
528 
529     movedDataToBuffer = true;
530 
531     // Continue with checking for an HTML meta tag if we were already doing so.
532     if (m_charsetParser)
533         return checkForMetaCharset(data, len);
534 
535     const char* ptr = m_buffer.data();
536     const char* pEnd = ptr + m_buffer.size();
537 
538     // Is there enough data available to check for XML declaration?
539     if (m_buffer.size() < 8)
540         return false;
541 
542     // Handle XML declaration, which can have encoding in it. This encoding is honored even for HTML documents.
543     // It is an error for an XML declaration not to be at the start of an XML document, and it is ignored in HTML documents in such case.
544     if (ptr[0] == '<' && ptr[1] == '?' && ptr[2] == 'x' && ptr[3] == 'm' && ptr[4] == 'l') {
545         const char* xmlDeclarationEnd = ptr;
546         while (xmlDeclarationEnd != pEnd && *xmlDeclarationEnd != '>')
547             ++xmlDeclarationEnd;
548         if (xmlDeclarationEnd == pEnd)
549             return false;
550         // No need for +1, because we have an extra "?" to lose at the end of XML declaration.
551         int len = 0;
552         int pos = findXMLEncoding(ptr, xmlDeclarationEnd - ptr, len);
553         if (pos != -1)
554             setEncoding(findTextEncoding(ptr + pos, len), EncodingFromXMLHeader);
555         // continue looking for a charset - it may be specified in an HTTP-Equiv meta
556     } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == '?' && ptr[3] == 0 && ptr[4] == 'x' && ptr[5] == 0) {
557         setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
558         return true;
559     } else if (ptr[0] == 0 && ptr[1] == '<' && ptr[2] == 0 && ptr[3] == '?' && ptr[4] == 0 && ptr[5] == 'x') {
560         setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
561         return true;
562     } else if (ptr[0] == '<' && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0 && ptr[4] == '?' && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == 0) {
563         setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
564         return true;
565     } else if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == '<' && ptr[4] == 0 && ptr[5] == 0 && ptr[6] == 0 && ptr[7] == '?') {
566         setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
567         return true;
568     }
569 
570     // The HTTP-EQUIV meta has no effect on XHTML.
571     if (m_contentType == XML)
572         return true;
573 
574     m_charsetParser = HTMLMetaCharsetParser::create();
575     return checkForMetaCharset(data, len);
576 }
577 
checkForMetaCharset(const char * data,size_t length)578 bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
579 {
580     if (!m_charsetParser->checkForMetaCharset(data, length))
581         return false;
582 
583     setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
584     m_charsetParser.clear();
585     m_checkedForHeadCharset = true;
586     return true;
587 }
588 
detectJapaneseEncoding(const char * data,size_t len)589 void TextResourceDecoder::detectJapaneseEncoding(const char* data, size_t len)
590 {
591     switch (KanjiCode::judge(data, len)) {
592         case KanjiCode::JIS:
593             setEncoding("ISO-2022-JP", AutoDetectedEncoding);
594             break;
595         case KanjiCode::EUC:
596             setEncoding("EUC-JP", AutoDetectedEncoding);
597             break;
598         case KanjiCode::SJIS:
599             setEncoding("Shift_JIS", AutoDetectedEncoding);
600             break;
601         case KanjiCode::ASCII:
602         case KanjiCode::UTF16:
603         case KanjiCode::UTF8:
604             break;
605     }
606 }
607 
608 // We use the encoding detector in two cases:
609 //   1. Encoding detector is turned ON and no other encoding source is
610 //      available (that is, it's DefaultEncoding).
611 //   2. Encoding detector is turned ON and the encoding is set to
612 //      the encoding of the parent frame, which is also auto-detected.
613 //   Note that condition #2 is NOT satisfied unless parent-child frame
614 //   relationship is compliant to the same-origin policy. If they're from
615 //   different domains, |m_source| would not be set to EncodingFromParentFrame
616 //   in the first place.
shouldAutoDetect() const617 bool TextResourceDecoder::shouldAutoDetect() const
618 {
619     // Just checking m_hintEncoding suffices here because it's only set
620     // in setHintEncoding when the source is AutoDetectedEncoding.
621     return m_usesEncodingDetector
622         && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
623 }
624 
decode(const char * data,size_t len)625 String TextResourceDecoder::decode(const char* data, size_t len)
626 {
627     size_t lengthOfBOM = 0;
628     if (!m_checkedForBOM)
629         lengthOfBOM = checkForBOM(data, len);
630 
631     bool movedDataToBuffer = false;
632 
633     if (m_contentType == CSS && !m_checkedForCSSCharset)
634         if (!checkForCSSCharset(data, len, movedDataToBuffer))
635             return "";
636 
637     if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
638         if (!checkForHeadCharset(data, len, movedDataToBuffer))
639             return "";
640 
641     // FIXME: It is wrong to change the encoding downstream after we have already done some decoding.
642     if (shouldAutoDetect()) {
643         if (m_encoding.isJapanese())
644             detectJapaneseEncoding(data, len); // FIXME: We should use detectTextEncoding() for all languages.
645         else {
646             TextEncoding detectedEncoding;
647             if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
648                 setEncoding(detectedEncoding, AutoDetectedEncoding);
649         }
650     }
651 
652     ASSERT(m_encoding.isValid());
653 
654     if (!m_codec)
655         m_codec = newTextCodec(m_encoding);
656 
657     if (m_buffer.isEmpty())
658         return m_codec->decode(data + lengthOfBOM, len - lengthOfBOM, false, m_contentType == XML, m_sawError);
659 
660     if (!movedDataToBuffer) {
661         size_t oldSize = m_buffer.size();
662         m_buffer.grow(oldSize + len);
663         memcpy(m_buffer.data() + oldSize, data, len);
664     }
665 
666     String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
667     m_buffer.clear();
668     return result;
669 }
670 
flush()671 String TextResourceDecoder::flush()
672 {
673    // If we can not identify the encoding even after a document is completely
674    // loaded, we need to detect the encoding if other conditions for
675    // autodetection is satisfied.
676     if (m_buffer.size() && shouldAutoDetect()
677         && ((!m_checkedForHeadCharset && (m_contentType == HTML || m_contentType == XML)) || (!m_checkedForCSSCharset && (m_contentType == CSS)))) {
678          TextEncoding detectedEncoding;
679          if (detectTextEncoding(m_buffer.data(), m_buffer.size(),
680                                 m_hintEncoding, &detectedEncoding))
681              setEncoding(detectedEncoding, AutoDetectedEncoding);
682     }
683 
684     if (!m_codec)
685         m_codec = newTextCodec(m_encoding);
686 
687     String result = m_codec->decode(m_buffer.data(), m_buffer.size(), true, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
688     m_buffer.clear();
689     m_codec.clear();
690     m_checkedForBOM = false; // Skip BOM again when re-decoding.
691     return result;
692 }
693 
694 }
695