1 /*
2 This file is part of the KDE libraries
3
4 Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5 Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6 Copyright (C) 2003 Apple Computer, Inc.
7 Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8
9 This library is free software; you can redistribute it and/or
10 modify it under the terms of the GNU Library General Public
11 License as published by the Free Software Foundation; either
12 version 2 of the License, or (at your option) any later version.
13
14 This library is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 Library General Public License for more details.
18
19 You should have received a copy of the GNU Library General Public License
20 along with this library; see the file COPYING.LIB. If not, write to
21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22 Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // decoder for input stream
27
28 #include "kencodingdetector.h"
29
30 #undef DECODE_DEBUG
31 //#define DECODE_DEBUG
32
33 #define MAX_BUFFER 16*1024
34
35 #include <assert.h>
36
37 #include "guess_ja_p.h"
38
39 #include "khtml_debug.h"
40 #include <QRegExp>
41 #include <QTextCodec>
42
43 #include "kcharsets.h"
44 #include <klocalizedstring.h>
45
46 #include <ctype.h>
47
// MIB numbers of the character sets this file treats specially.
// They are compared against the value returned by QTextCodec::mibEnum().
enum MIB {
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014,
    MibUtf16   = 1015
};
57
is16Bit(QTextCodec * codec)58 static bool is16Bit(QTextCodec *codec)
59 {
60 switch (codec->mibEnum()) {
61 case MibUtf16:
62 case MibUtf16BE:
63 case MibUtf16LE:
64 case MibUcs2:
65 return true;
66 default:
67 return false;
68 }
69 }
70
71 class KEncodingDetectorPrivate
72 {
73 public:
74 QTextCodec *m_codec;
75 QTextDecoder *m_decoder; // utf16
76 QTextCodec *m_defaultCodec;
77 QByteArray m_storeDecoderName;
78
79 KEncodingDetector::EncodingChoiceSource m_source;
80 KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
81
82 bool m_visualRTL : 1;
83 bool m_seenBody : 1;
84 bool m_writtingHappened : 1;
85 bool m_analyzeCalled : 1; //for decode()
86 int m_multiByte;
87
88 QByteArray m_bufferForDefferedEncDetection;
89
KEncodingDetectorPrivate()90 KEncodingDetectorPrivate()
91 : m_codec(QTextCodec::codecForMib(MibLatin1))
92 , m_decoder(m_codec->makeDecoder())
93 , m_defaultCodec(m_codec)
94 , m_source(KEncodingDetector::DefaultEncoding)
95 , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
96 , m_visualRTL(false)
97 , m_seenBody(false)
98 , m_writtingHappened(false)
99 , m_analyzeCalled(false)
100 , m_multiByte(0)
101 {
102 }
103
KEncodingDetectorPrivate(QTextCodec * codec,KEncodingDetector::EncodingChoiceSource source,KEncodingDetector::AutoDetectScript script)104 KEncodingDetectorPrivate(QTextCodec *codec, KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
105 : m_codec(codec)
106 , m_decoder(m_codec->makeDecoder())
107 , m_defaultCodec(m_codec)
108 , m_source(source)
109 , m_autoDetectLanguage(script)
110 , m_visualRTL(false)
111 , m_seenBody(false)
112 , m_writtingHappened(false)
113 , m_analyzeCalled(false)
114 , m_multiByte(0)
115 {
116 }
117
~KEncodingDetectorPrivate()118 ~KEncodingDetectorPrivate()
119 {
120 delete m_decoder;
121 }
122
123 // Returns true if the encoding was explicitly specified someplace.
isExplicitlySpecifiedEncoding()124 bool isExplicitlySpecifiedEncoding()
125 {
126 return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
127 }
128 };
129
automaticDetectionForArabic(const unsigned char * ptr,int size)130 static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
131 {
132 for (int i = 0; i < size; ++i) {
133 if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
134 || (ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB) || (ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA)
135 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
136 || (ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF) || (ptr[ i ] >= 0xF3)) {
137 return "cp1256";
138 }
139 }
140
141 return "iso-8859-6";
142 }
143
automaticDetectionForBaltic(const unsigned char * ptr,int size)144 static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
145 {
146 for (int i = 0; i < size; ++i) {
147 if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E)) {
148 return "cp1257";
149 }
150
151 if (ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5) {
152 return "iso-8859-13";
153 }
154 }
155
156 return "iso-8859-13";
157 }
158
automaticDetectionForCentralEuropean(const unsigned char * ptr,int size)159 static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
160 {
161 QByteArray charset = QByteArray();
162 for (int i = 0; i < size; ++i) {
163 if (ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) {
164 if (ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98) {
165 return "ibm852";
166 }
167
168 if (i + 1 > size) {
169 return "cp1250";
170 } else { // maybe ibm852 ?
171 charset = "cp1250";
172 continue;
173 }
174 }
175 if (ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0) {
176 if (i + 1 > size) {
177 return "iso-8859-2";
178 } else { // maybe ibm852 ?
179 if (charset.isNull()) {
180 charset = "iso-8859-2";
181 }
182 continue;
183 }
184 }
185 }
186
187 if (charset.isNull()) {
188 charset = "iso-8859-3";
189 }
190
191 return charset.data();
192 }
193
automaticDetectionForCyrillic(const unsigned char * ptr,int size)194 static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
195 {
196 #ifdef DECODE_DEBUG
197 qCWarning(KHTML_LOG) << "KEncodingDetector: Cyr heuristics";
198 #endif
199
200 // if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
201 // return "utf8";
202 int utf8_mark = 0;
203 int koi_score = 0;
204 int cp1251_score = 0;
205
206 int koi_st = 0;
207 int cp1251_st = 0;
208
209 // int koi_na=0;
210 // int cp1251_na=0;
211
212 int koi_o_capital = 0;
213 int koi_o = 0;
214 int cp1251_o_capital = 0;
215 int cp1251_o = 0;
216
217 int koi_a_capital = 0;
218 int koi_a = 0;
219 int cp1251_a_capital = 0;
220 int cp1251_a = 0;
221
222 int koi_s_capital = 0;
223 int koi_s = 0;
224 int cp1251_s_capital = 0;
225 int cp1251_s = 0;
226
227 int koi_i_capital = 0;
228 int koi_i = 0;
229 int cp1251_i_capital = 0;
230 int cp1251_i = 0;
231
232 int cp1251_small_range = 0;
233 int koi_small_range = 0;
234 int ibm866_small_range = 0;
235
236 int i;
237 for (i = 1; (i < size) && (cp1251_small_range + koi_small_range < 1000); ++i) {
238 if (ptr[i] > 0xdf) {
239 ++cp1251_small_range;
240
241 if (ptr[i] == 0xee) { //small o
242 ++cp1251_o;
243 } else if (ptr[i] == 0xe0) { //small a
244 ++cp1251_a;
245 } else if (ptr[i] == 0xe8) { //small i
246 ++cp1251_i;
247 } else if (ptr[i] == 0xf1) { //small s
248 ++cp1251_s;
249 } else if (ptr[i] == 0xf2 && ptr[i - 1] == 0xf1) { //small st
250 ++cp1251_st;
251 }
252
253 else if (ptr[i] == 0xef) {
254 ++koi_o_capital;
255 } else if (ptr[i] == 0xe1) {
256 ++koi_a_capital;
257 } else if (ptr[i] == 0xe9) {
258 ++koi_i_capital;
259 } else if (ptr[i] == 0xf3) {
260 ++koi_s_capital;
261 }
262
263 } else if (ptr[i] > 0xbf) {
264 ++koi_small_range;
265
266 if (ptr[i] == 0xd0 || ptr[i] == 0xd1) { //small o
267 ++utf8_mark;
268 } else if (ptr[i] == 0xcf) { //small o
269 ++koi_o;
270 } else if (ptr[i] == 0xc1) { //small a
271 ++koi_a;
272 } else if (ptr[i] == 0xc9) { //small i
273 ++koi_i;
274 } else if (ptr[i] == 0xd3) { //small s
275 ++koi_s;
276 } else if (ptr[i] == 0xd4 && ptr[i - 1] == 0xd3) { //small st
277 ++koi_st;
278 }
279
280 else if (ptr[i] == 0xce) {
281 ++cp1251_o_capital;
282 } else if (ptr[i] == 0xc0) {
283 ++cp1251_a_capital;
284 } else if (ptr[i] == 0xc8) {
285 ++cp1251_i_capital;
286 } else if (ptr[i] == 0xd1) {
287 ++cp1251_s_capital;
288 }
289 } else if (ptr[i] > 0x9f && ptr[i] < 0xb0) { //first 16 letterz is 60%
290 ++ibm866_small_range;
291 }
292
293 }
294
295 //cannot decide?
296 if (cp1251_small_range + koi_small_range + ibm866_small_range < 8) {
297 return "";
298 }
299
300 if (3 * utf8_mark > cp1251_small_range + koi_small_range + ibm866_small_range) {
301 #ifdef DECODE_DEBUG
302 qCWarning(KHTML_LOG) << "Cyr Enc Detection: UTF8";
303 #endif
304 return "UTF-8";
305 }
306
307 if (ibm866_small_range > cp1251_small_range + koi_small_range) {
308 return "ibm866";
309 }
310
311 // QByteArray koi_string = "koi8-u";
312 // QByteArray cp1251_string = "cp1251";
313
314 if (cp1251_st == 0 && koi_st > 1) {
315 koi_score += 10;
316 } else if (koi_st == 0 && cp1251_st > 1) {
317 cp1251_score += 10;
318 }
319
320 if (cp1251_st && koi_st) {
321 if (cp1251_st / koi_st > 2) {
322 cp1251_score += 20;
323 } else if (koi_st / cp1251_st > 2) {
324 koi_score += 20;
325 }
326 }
327
328 if (cp1251_a > koi_a) {
329 cp1251_score += 10;
330 } else if (cp1251_a || koi_a) {
331 koi_score += 10;
332 }
333
334 if (cp1251_o > koi_o) {
335 cp1251_score += 10;
336 } else if (cp1251_o || koi_o) {
337 koi_score += 10;
338 }
339
340 if (cp1251_i > koi_i) {
341 cp1251_score += 10;
342 } else if (cp1251_i || koi_i) {
343 koi_score += 10;
344 }
345
346 if (cp1251_s > koi_s) {
347 cp1251_score += 10;
348 } else if (cp1251_s || koi_s) {
349 koi_score += 10;
350 }
351
352 if (cp1251_a_capital > koi_a_capital) {
353 cp1251_score += 9;
354 } else if (cp1251_a_capital || koi_a_capital) {
355 koi_score += 9;
356 }
357
358 if (cp1251_o_capital > koi_o_capital) {
359 cp1251_score += 9;
360 } else if (cp1251_o_capital || koi_o_capital) {
361 koi_score += 9;
362 }
363
364 if (cp1251_i_capital > koi_i_capital) {
365 cp1251_score += 9;
366 } else if (cp1251_i_capital || koi_i_capital) {
367 koi_score += 9;
368 }
369
370 if (cp1251_s_capital > koi_s_capital) {
371 cp1251_score += 9;
372 } else if (cp1251_s_capital || koi_s_capital) {
373 koi_score += 9;
374 }
375 #ifdef DECODE_DEBUG
376 qCWarning(KHTML_LOG) << "koi_score " << koi_score << " cp1251_score " << cp1251_score;
377 #endif
378 if (abs(koi_score - cp1251_score) < 10) {
379 //fallback...
380 cp1251_score = cp1251_small_range;
381 koi_score = koi_small_range;
382 }
383 if (cp1251_score > koi_score) {
384 return "cp1251";
385 } else {
386 return "koi8-u";
387 }
388
389 // if (cp1251_score>koi_score)
390 // setEncoding("cp1251",AutoDetectedEncoding);
391 // else
392 // setEncoding("koi8-u",AutoDetectedEncoding);
393 // return true;
394
395 }
396
automaticDetectionForGreek(const unsigned char * ptr,int size)397 static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
398 {
399 for (int i = 0; i < size; ++i) {
400 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
401 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
402 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE) {
403 return "cp1253";
404 }
405 }
406
407 return "iso-8859-7";
408 }
409
automaticDetectionForHebrew(const unsigned char * ptr,int size)410 static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
411 {
412 for (int i = 0; i < size; ++i) {
413 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89) || ptr[ i ] == 0x8B
414 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || (ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9)
415 || (ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8)) {
416 return "cp1255";
417 }
418
419 if (ptr[ i ] == 0xDF) {
420 return "iso-8859-8-i";
421 }
422 }
423
424 return "iso-8859-8-i";
425 }
426
automaticDetectionForJapanese(const unsigned char * ptr,int size)427 static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
428 {
429 JapaneseCode kc;
430
431 switch (kc.guess_jp((const char *)ptr, size)) {
432 case JapaneseCode::JIS:
433 return "jis7";
434 case JapaneseCode::EUC:
435 return "eucjp";
436 case JapaneseCode::SJIS:
437 return "sjis";
438 case JapaneseCode::UTF8:
439 return "utf8";
440 default:
441 break;
442 }
443
444 return "";
445 }
446
automaticDetectionForTurkish(const unsigned char * ptr,int size)447 static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
448 {
449 for (int i = 0; i < size; ++i) {
450 if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C) || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C) || ptr[ i ] == 0x9F) {
451 return "cp1254";
452 }
453 }
454
455 return "iso-8859-9";
456 }
457
automaticDetectionForWesternEuropean(const unsigned char * ptr,int size)458 static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
459 {
460 --size;
461 uint nonansi_count = 0;
462 for (int i = 0; i < size; ++i) {
463 if (ptr[i] > 0x79) {
464 ++nonansi_count;
465 if (ptr[i] > 0xc1 && ptr[i] < 0xf0 && ptr[i + 1] > 0x7f && ptr[i + 1] < 0xc0) {
466 return "UTF-8";
467 }
468 if (ptr[i] >= 0x78 && ptr[i] <= 0x9F) {
469 return "cp1252";
470 }
471 }
472
473 }
474
475 if (nonansi_count > 0) {
476 return "iso-8859-15";
477 }
478
479 return "";
480 }
481
482 // Other browsers allow comments in the head section, so we need to also.
483 // It's important not to look for tags inside the comments.
// Advances @p ptr past an HTML comment body.
//
// @param ptr   on entry, the first character after the opening "<!--";
//              on exit, the first character after the comment terminator, or
//              @p pEnd when the comment is not terminated inside the range.
// @param pEnd  one past the last readable character.
//
// Accepted terminators: "-->", the incorrect but tolerated "--!>", and a lone
// ">" directly after the opener (so that "<!-->" is a complete comment).
//
// Nothing at or beyond @p pEnd is ever read, and @p ptr never ends up beyond
// @p pEnd: a terminator that straddles the end of the range is treated as
// absent.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;

    if (p >= pEnd) {
        // Nothing left to inspect; also repairs a pointer that overshot.
        ptr = pEnd;
        return;
    }

    // Allow <!-->; other browsers do.
    if (*p == '>') {
        ++p;
    } else {
        while (p < pEnd) {
            if (*p == '-') {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }

    ptr = p;
}
509
510 // Returns the position of the encoding string.
findXMLEncoding(const QByteArray & str,int & encodingLength)511 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
512 {
513 int len = str.length();
514 int pos = str.indexOf("encoding");
515 if (pos == -1) {
516 return -1;
517 }
518 pos += 8;
519
520 // Skip spaces and stray control characters.
521 while (pos < len && str[pos] <= ' ') {
522 ++pos;
523 }
524
525 //Bail out if nothing after
526 // Skip equals sign.
527 if (pos >= len || str[pos] != '=') {
528 return -1;
529 }
530 ++pos;
531
532 // Skip spaces and stray control characters.
533 while (pos < len && str[pos] <= ' ') {
534 ++pos;
535 }
536
537 //Bail out if nothing after
538 if (pos >= len) {
539 return -1;
540 }
541
542 // Skip quotation mark.
543 char quoteMark = str[pos];
544 if (quoteMark != '"' && quoteMark != '\'') {
545 return -1;
546 }
547 ++pos;
548
549 // Find the trailing quotation mark.
550 int end = pos;
551 while (end < len && str[end] != quoteMark) {
552 ++end;
553 }
554
555 if (end >= len) {
556 return -1;
557 }
558
559 encodingLength = end - pos;
560 return pos;
561 }
562
processNull(char * data,int len)563 bool KEncodingDetector::processNull(char *data, int len)
564 {
565 bool bin = false;
566 if (is16Bit(d->m_codec)) {
567 for (int i = 1; i < len; i += 2) {
568 if ((data[i] == '\0') && (data[i - 1] == '\0')) {
569 bin = true;
570 data[i] = ' ';
571 }
572 }
573 return bin;
574 }
575 // replace '\0' by spaces, for buggy pages
576 int i = len - 1;
577 while (--i >= 0) {
578 if (data[i] == 0) {
579 bin = true;
580 data[i] = ' ';
581 }
582 }
583 return bin;
584 }
585
errorsIfUtf8(const char * data,int length)586 bool KEncodingDetector::errorsIfUtf8(const char *data, int length)
587 {
588 if (d->m_codec->mibEnum() != MibUtf8) {
589 return false; //means no errors
590 }
591 // #define highest1Bits (unsigned char)0x80
592 // #define highest2Bits (unsigned char)0xC0
593 // #define highest3Bits (unsigned char)0xE0
594 // #define highest4Bits (unsigned char)0xF0
595 // #define highest5Bits (unsigned char)0xF8
596 static const unsigned char highest1Bits = 0x80;
597 static const unsigned char highest2Bits = 0xC0;
598 static const unsigned char highest3Bits = 0xE0;
599 static const unsigned char highest4Bits = 0xF0;
600 static const unsigned char highest5Bits = 0xF8;
601
602 for (int i = 0; i < length; ++i) {
603 unsigned char c = data[i];
604
605 if (d->m_multiByte > 0) {
606 if ((c & highest2Bits) == 0x80) {
607 --(d->m_multiByte);
608 continue;
609 }
610 #ifdef DECODE_DEBUG
611 qCWarning(KHTML_LOG) << "EncDetector: Broken UTF8";
612 #endif
613 return true;
614 }
615
616 // most significant bit zero, single char
617 if ((c & highest1Bits) == 0x00) {
618 continue;
619 }
620
621 // 110xxxxx => init 1 following bytes
622 if ((c & highest3Bits) == 0xC0) {
623 d->m_multiByte = 1;
624 continue;
625 }
626
627 // 1110xxxx => init 2 following bytes
628 if ((c & highest4Bits) == 0xE0) {
629 d->m_multiByte = 2;
630 continue;
631 }
632
633 // 11110xxx => init 3 following bytes
634 if ((c & highest5Bits) == 0xF0) {
635 d->m_multiByte = 3;
636 continue;
637 }
638 #ifdef DECODE_DEBUG
639 qCWarning(KHTML_LOG) << "EncDetector:_Broken UTF8";
640 #endif
641 return true;
642 }
643 return false;
644 }
645
KEncodingDetector()646 KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
647 {
648 }
649
KEncodingDetector(QTextCodec * codec,EncodingChoiceSource source,AutoDetectScript script)650 KEncodingDetector::KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script) :
651 d(new KEncodingDetectorPrivate(codec, source, script))
652 {
653 }
654
~KEncodingDetector()655 KEncodingDetector::~KEncodingDetector()
656 {
657 delete d;
658 }
659
setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang)660 void KEncodingDetector::setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang)
661 {
662 d->m_autoDetectLanguage = lang;
663 }
autoDetectLanguage() const664 KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
665 {
666 return d->m_autoDetectLanguage;
667 }
668
encodingChoiceSource() const669 KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
670 {
671 return d->m_source;
672 }
673
encoding() const674 const char *KEncodingDetector::encoding() const
675 {
676 d->m_storeDecoderName = d->m_codec->name();
677 return d->m_storeDecoderName.constData();
678 }
679
visuallyOrdered() const680 bool KEncodingDetector::visuallyOrdered() const
681 {
682 return d->m_visualRTL;
683 }
684
685 // const QTextCodec* KEncodingDetector::codec() const
686 // {
687 // return d->m_codec;
688 // }
689
decoder()690 QTextDecoder *KEncodingDetector::decoder()
691 {
692 return d->m_decoder;
693 }
694
resetDecoder()695 void KEncodingDetector::resetDecoder()
696 {
697 assert(d->m_defaultCodec);
698 d->m_bufferForDefferedEncDetection.clear();
699 d->m_writtingHappened = false;
700 d->m_analyzeCalled = false;
701 d->m_multiByte = 0;
702 delete d->m_decoder;
703 if (!d->m_codec) {
704 d->m_codec = d->m_defaultCodec;
705 }
706 d->m_decoder = d->m_codec->makeDecoder();
707 }
708
setEncoding(const char * _encoding,EncodingChoiceSource type)709 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
710 {
711 QTextCodec *codec;
712 QByteArray enc(_encoding);
713 if (/*enc.isNull() || */enc.isEmpty()) {
714 if (type == DefaultEncoding) {
715 codec = d->m_defaultCodec;
716 } else {
717 return false;
718 }
719 } else {
720 //QString->QTextCodec
721
722 enc = enc.toLower();
723 // hebrew visually ordered
724 if (enc == "visual") {
725 enc = "iso8859-8";
726 }
727 bool b;
728 codec = KCharsets::charsets()->codecForName(QLatin1String(enc.data()), b);
729 if (!b) {
730 return false;
731 }
732 }
733
734 if (d->m_codec->mibEnum() == codec->mibEnum()) {
735 // We already have the codec, but we still want to re-set the type,
736 // as we may have overwritten a default with a detected
737 d->m_source = type;
738 return true;
739 }
740
741 if ((type == EncodingFromMetaTag || type == EncodingFromXMLHeader) && is16Bit(codec)) {
742 //Sometimes the codec specified is absurd, i.e. UTF-16 despite
743 //us decoding a meta tag as ASCII. In that case, ignore it.
744 return false;
745 }
746
747 if (codec->mibEnum() == Mib8859_8) {
748 //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
749 codec = QTextCodec::codecForName("iso8859-8-i");
750
751 // visually ordered unless one of the following
752 if (!(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical")) {
753 d->m_visualRTL = true;
754 }
755 }
756
757 d->m_codec = codec;
758 d->m_source = type;
759 delete d->m_decoder;
760 d->m_decoder = d->m_codec->makeDecoder();
761 #ifdef DECODE_DEBUG
762 qCDebug(KHTML_LOG) << "KEncodingDetector::encoding used is" << d->m_codec->name();
763 #endif
764 return true;
765 }
766
decode(const char * data,int len)767 QString KEncodingDetector::decode(const char *data, int len)
768 {
769 processNull(const_cast<char *>(data), len);
770 if (!d->m_analyzeCalled) {
771 analyze(data, len);
772 d->m_analyzeCalled = true;
773 }
774
775 return d->m_decoder->toUnicode(data, len);
776 }
777
decode(const QByteArray & data)778 QString KEncodingDetector::decode(const QByteArray &data)
779 {
780 processNull(const_cast<char *>(data.data()), data.size());
781 if (!d->m_analyzeCalled) {
782 analyze(data.data(), data.size());
783 d->m_analyzeCalled = true;
784 }
785
786 return d->m_decoder->toUnicode(data);
787 }
788
decodeWithBuffering(const char * data,int len)789 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
790 {
791 #ifdef DECODE_DEBUG
792 qCWarning(KHTML_LOG) << "KEncodingDetector: decoding " << len << " bytes";
793 #endif
794 if (d->m_writtingHappened) {
795 #ifdef DECODE_DEBUG
796 qCWarning(KHTML_LOG) << "KEncodingDetector: d->m_writtingHappened " << d->m_codec->name();
797 #endif
798 processNull(const_cast<char *>(data), len);
799 return d->m_decoder->toUnicode(data, len);
800 } else {
801 if (d->m_bufferForDefferedEncDetection.isEmpty()) {
802 // If encoding detection produced something, and we either got to the body or
803 // actually saw the encoding explicitly, we're done.
804 if (analyze(data, len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) {
805 #ifdef DECODE_DEBUG
806 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened first time " << d->m_codec->name();
807 #endif
808 processNull(const_cast<char *>(data), len);
809 d->m_writtingHappened = true;
810 return d->m_decoder->toUnicode(data, len);
811 } else {
812 #ifdef DECODE_DEBUG
813 qCWarning(KHTML_LOG) << "KEncodingDetector: begin deffer";
814 #endif
815 d->m_bufferForDefferedEncDetection = data;
816 }
817 } else {
818 d->m_bufferForDefferedEncDetection += data;
819 // As above, but also limit the buffer size. We must use the entire buffer here,
820 // since the boundaries might split the meta tag, etc.
821 bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
822 if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
823 d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) {
824 d->m_writtingHappened = true;
825 d->m_bufferForDefferedEncDetection.replace('\0', ' ');
826 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
827 d->m_bufferForDefferedEncDetection.clear();
828 #ifdef DECODE_DEBUG
829 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
830 #endif
831 return result;
832 }
833 }
834 }
835
836 return QString();
837 }
838
decodedInvalidCharacters() const839 bool KEncodingDetector::decodedInvalidCharacters() const
840 {
841 return d->m_decoder ? d->m_decoder->hasFailure() : false;
842 }
843
flush()844 QString KEncodingDetector::flush()
845 {
846 if (d->m_bufferForDefferedEncDetection.isEmpty()) {
847 return QString();
848 }
849
850 d->m_bufferForDefferedEncDetection.replace('\0', ' ');
851 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
852 d->m_bufferForDefferedEncDetection.clear();
853 #ifdef DECODE_DEBUG
854 qCWarning(KHTML_LOG) << "KEncodingDetector:flush() " << d->m_bufferForDefferedEncDetection.length() << " bytes " << d->m_codec->name();
855 #endif
856 return result;
857 }
858
analyze(const char * data,int len)859 bool KEncodingDetector::analyze(const char *data, int len)
860 {
861 // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
862 // maximumBOMLength = 10
863 // Even if the user has chosen utf16 we still need to auto-detect the endianness
864 if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) {
865 // Extract the first three bytes.
866 const uchar *udata = (const uchar *)data;
867 uchar c1 = *udata++;
868 uchar c2 = *udata++;
869 uchar c3 = *udata++;
870
871 // Check for the BOM
872 const char *autoDetectedEncoding;
873 if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
874 autoDetectedEncoding = "UTF-16";
875 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
876 autoDetectedEncoding = "UTF-8";
877 } else if (c1 == 0x00 || c2 == 0x00) {
878 uchar c4 = *udata++;
879 uchar c5 = *udata++;
880 uchar c6 = *udata++;
881 uchar c7 = *udata++;
882 uchar c8 = *udata++;
883 uchar c9 = *udata++;
884 uchar c10 = *udata++;
885
886 int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
887 int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
888 if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) {
889 autoDetectedEncoding = "UTF-16";
890 } else {
891 autoDetectedEncoding = nullptr;
892 }
893 } else {
894 autoDetectedEncoding = nullptr;
895 }
896
897 // If we found a BOM, use the encoding it implies.
898 if (autoDetectedEncoding != nullptr) {
899 d->m_source = BOM;
900 d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
901 assert(d->m_codec);
902 //enc = d->m_codec->name();
903 delete d->m_decoder;
904 d->m_decoder = d->m_codec->makeDecoder();
905 #ifdef DECODE_DEBUG
906 qCWarning(KHTML_LOG) << "Detection by BOM";
907 #endif
908 if (is16Bit(d->m_codec) && c2 == 0x00) {
909 // utf16LE, we need to put the decoder in LE mode
910 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
911 d->m_decoder->toUnicode(reverseUtf16, 2);
912 }
913 return true;
914 }
915 }
916
917 //exit from routine in case it was called to only detect byte order for utf-16
918 if (d->m_source == UserChosenEncoding) {
919 #ifdef DECODE_DEBUG
920 qCWarning(KHTML_LOG) << "KEncodingDetector: UserChosenEncoding exit ";
921 #endif
922
923 if (errorsIfUtf8(data, len)) {
924 setEncoding("", DefaultEncoding);
925 }
926 return true;
927 }
928
929 // HTTP header takes precedence over meta-type stuff
930 if (d->m_source == EncodingFromHTTPHeader) {
931 return true;
932 }
933
934 if (!d->m_seenBody) {
935 // we still don't have an encoding, and are in the head
936 // the following tags are allowed in <head>:
937 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
938 const char *ptr = data;
939 const char *pEnd = data + len;
940
941 while (ptr != pEnd) {
942 if (*ptr != '<') {
943 ++ptr;
944 continue;
945 }
946 ++ptr;
947 // Handle comments.
948 if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
949 ptr += 3;
950 skipComment(ptr, pEnd);
951 continue;
952 }
953
954 // Handle XML header, which can have encoding in it.
955 if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
956 const char *end = ptr;
957 while (*end != '>' && end < pEnd) {
958 end++;
959 }
960 if (*end == '\0' || end == pEnd) {
961 break;
962 }
963 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
964 int length;
965 int pos = findXMLEncoding(str, length);
966 // also handles the case when specified encoding aint correct
967 if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) {
968 return true;
969 }
970 }
971
972 //look for <meta>, stop if we reach <body>
973 while (
974 !(((*ptr >= 'a') && (*ptr <= 'z')) ||
975 ((*ptr >= 'A') && (*ptr <= 'Z')))
976 && ptr < pEnd
977 ) {
978 ++ptr;
979 }
980
981 char tmp[5];
982 int length = 0;
983 const char *max = ptr + 4;
984 if (pEnd < max) {
985 max = pEnd;
986 }
987 while (
988 (((*ptr >= 'a') && (*ptr <= 'z')) ||
989 ((*ptr >= 'A') && (*ptr <= 'Z')) ||
990 ((*ptr >= '0') && (*ptr <= '9')))
991 && ptr < max
992 ) {
993 tmp[length] = tolower(*ptr);
994 ++ptr;
995 ++length;
996 }
997 tmp[length] = 0;
998 if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') {
999 // found a meta tag...
1000 const char *end = ptr;
1001 while (*end != '>' && *end != '\0' && end < pEnd) {
1002 end++;
1003 }
1004 //if ( *end == '\0' ) break;
1005 const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower();
1006 const int strLength = str.length();
1007 int pos = 0;
1008 //if( (pos = str.find("http-equiv", pos)) == -1) break;
1009 //if( (pos = str.find("content-type", pos)) == -1) break;
1010 if ((pos = str.indexOf("charset")) == -1) {
1011 continue;
1012 }
1013 pos += 6;
1014 // skip to '='
1015 if ((pos = str.indexOf("=", pos)) == -1) {
1016 continue;
1017 }
1018
1019 // skip '='
1020 ++pos;
1021
1022 // skip whitespace before encoding itself
1023 while (pos < strLength && str[pos] <= ' ') {
1024 ++pos;
1025 }
1026
1027 // there may also be an opening quote, if this is a charset= and not a http-equiv.
1028 if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) {
1029 ++pos;
1030 }
1031
1032 // skip whitespace
1033 while (pos < strLength && str[pos] <= ' ') {
1034 ++pos;
1035 }
1036
1037 if (pos == strLength) {
1038 continue;
1039 }
1040
1041 int endpos = pos;
1042 while (endpos < strLength &&
1043 (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1044 && str[endpos] != ';' && str[endpos] != '>')) {
1045 ++endpos;
1046 }
1047 #ifdef DECODE_DEBUG
1048 qCDebug(KHTML_LOG) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data();
1049 #endif
1050 if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) {
1051 return true;
1052 }
1053 } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') {
1054 d->m_seenBody = true;
1055 break;
1056 }
1057 }
1058 }
1059
1060 if (len < 20) {
1061 return false;
1062 }
1063
1064 #ifdef DECODE_DEBUG
1065 qCDebug(KHTML_LOG) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1066 #endif
1067
1068 switch (d->m_autoDetectLanguage) {
1069 case KEncodingDetector::Arabic:
1070 return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1071 // break;
1072 case KEncodingDetector::Baltic:
1073 return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1074 // break;
1075 case KEncodingDetector::CentralEuropean:
1076 return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1077 // break;
1078 case KEncodingDetector::Cyrillic:
1079 return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1080 // break;
1081 case KEncodingDetector::Greek:
1082 return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1083 // break;
1084 case KEncodingDetector::Hebrew:
1085 return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1086 // break;
1087 case KEncodingDetector::Japanese:
1088 return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1089 // break;
1090 case KEncodingDetector::Turkish:
1091 return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1092 // break;
1093 case KEncodingDetector::WesternEuropean:
1094 if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) {
1095 return true;
1096 } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml
1097 return setEncoding("iso-8859-15", AutoDetectedEncoding);
1098 } else { //use default provided by eg katepart
1099 return setEncoding("", DefaultEncoding);
1100 }
1101 // break;
1102 case KEncodingDetector::SemiautomaticDetection:
1103 case KEncodingDetector::ChineseSimplified:
1104 case KEncodingDetector::ChineseTraditional:
1105 case KEncodingDetector::Korean:
1106 case KEncodingDetector::Thai:
1107 case KEncodingDetector::Unicode:
1108 case KEncodingDetector::NorthernSaami:
1109 case KEncodingDetector::SouthEasternEurope:
1110 case KEncodingDetector::None:
1111 // huh. somethings broken in this code ### FIXME
1112 //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1113 break;
1114 }
1115
1116 return true;
1117 }
1118
scriptForName(const QString & lang)1119 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString &lang)
1120 {
1121 if (lang.isEmpty()) {
1122 return KEncodingDetector::None;
1123 } else if (lang == i18nc("@item Text character set", "Unicode")) {
1124 return KEncodingDetector::Unicode;
1125 } else if (lang == i18nc("@item Text character set", "Cyrillic")) {
1126 return KEncodingDetector::Cyrillic;
1127 } else if (lang == i18nc("@item Text character set", "Western European")) {
1128 return KEncodingDetector::WesternEuropean;
1129 } else if (lang == i18nc("@item Text character set", "Central European")) {
1130 return KEncodingDetector::CentralEuropean;
1131 } else if (lang == i18nc("@item Text character set", "Greek")) {
1132 return KEncodingDetector::Greek;
1133 } else if (lang == i18nc("@item Text character set", "Hebrew")) {
1134 return KEncodingDetector::Hebrew;
1135 } else if (lang == i18nc("@item Text character set", "Turkish")) {
1136 return KEncodingDetector::Turkish;
1137 } else if (lang == i18nc("@item Text character set", "Japanese")) {
1138 return KEncodingDetector::Japanese;
1139 } else if (lang == i18nc("@item Text character set", "Baltic")) {
1140 return KEncodingDetector::Baltic;
1141 } else if (lang == i18nc("@item Text character set", "Arabic")) {
1142 return KEncodingDetector::Arabic;
1143 }
1144
1145 return KEncodingDetector::None;
1146 }
1147
hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)1148 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1149 {
1150 switch (script) {
1151 case KEncodingDetector::Arabic:
1152 return true;
1153 case KEncodingDetector::Baltic:
1154 return true;
1155 case KEncodingDetector::CentralEuropean:
1156 return true;
1157 case KEncodingDetector::Cyrillic:
1158 return true;
1159 case KEncodingDetector::Greek:
1160 return true;
1161 case KEncodingDetector::Hebrew:
1162 return true;
1163 case KEncodingDetector::Japanese:
1164 return true;
1165 case KEncodingDetector::Turkish:
1166 return true;
1167 case KEncodingDetector::WesternEuropean:
1168 return true;
1169 case KEncodingDetector::ChineseTraditional:
1170 return true;
1171 case KEncodingDetector::ChineseSimplified:
1172 return true;
1173 case KEncodingDetector::Unicode:
1174 return true;
1175 break;
1176 default:
1177 return false;
1178 }
1179 }
1180
nameForScript(KEncodingDetector::AutoDetectScript script)1181 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1182 {
1183 switch (script) {
1184 case KEncodingDetector::Arabic:
1185 return i18nc("@item Text character set", "Arabic");
1186 break;
1187 case KEncodingDetector::Baltic:
1188 return i18nc("@item Text character set", "Baltic");
1189 break;
1190 case KEncodingDetector::CentralEuropean:
1191 return i18nc("@item Text character set", "Central European");
1192 break;
1193 case KEncodingDetector::Cyrillic:
1194 return i18nc("@item Text character set", "Cyrillic");
1195 break;
1196 case KEncodingDetector::Greek:
1197 return i18nc("@item Text character set", "Greek");
1198 break;
1199 case KEncodingDetector::Hebrew:
1200 return i18nc("@item Text character set", "Hebrew");
1201 break;
1202 case KEncodingDetector::Japanese:
1203 return i18nc("@item Text character set", "Japanese");
1204 break;
1205 case KEncodingDetector::Turkish:
1206 return i18nc("@item Text character set", "Turkish");
1207 break;
1208 case KEncodingDetector::WesternEuropean:
1209 return i18nc("@item Text character set", "Western European");
1210 break;
1211 case KEncodingDetector::ChineseTraditional:
1212 return i18nc("@item Text character set", "Chinese Traditional");
1213 break;
1214 case KEncodingDetector::ChineseSimplified:
1215 return i18nc("@item Text character set", "Chinese Simplified");
1216 break;
1217 case KEncodingDetector::Korean:
1218 return i18nc("@item Text character set", "Korean");
1219 break;
1220 case KEncodingDetector::Thai:
1221 return i18nc("@item Text character set", "Thai");
1222 break;
1223 case KEncodingDetector::Unicode:
1224 return i18nc("@item Text character set", "Unicode");
1225 break;
1226 //case KEncodingDetector::SemiautomaticDetection:
1227 default:
1228 return QString();
1229
1230 }
1231 }
1232
1233 #undef DECODE_DEBUG
1234
1235