1 /*
2     This file is part of the KDE libraries
3 
4     Copyright (C) 1999 Lars Knoll (knoll@kde.org)
5     Copyright (C) 2003 Dirk Mueller (mueller@kde.org)
6     Copyright (C) 2003 Apple Computer, Inc.
7     Copyright (C) 2007 Nick Shaforostoff (shafff@ukr.net)
8 
9     This library is free software; you can redistribute it and/or
10     modify it under the terms of the GNU Library General Public
11     License as published by the Free Software Foundation; either
12     version 2 of the License, or (at your option) any later version.
13 
14     This library is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17     Library General Public License for more details.
18 
19     You should have received a copy of the GNU Library General Public License
20     along with this library; see the file COPYING.LIB.  If not, write to
21     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
22     Boston, MA 02110-1301, USA.
23 */
24 //----------------------------------------------------------------------------
25 //
26 // decoder for input stream
27 
28 #include "kencodingdetector.h"
29 
30 #undef DECODE_DEBUG
31 //#define DECODE_DEBUG
32 
33 #define MAX_BUFFER 16*1024
34 
35 #include <assert.h>
36 
37 #include "guess_ja_p.h"
38 
39 #include "khtml_debug.h"
40 #include <QRegExp>
41 #include <QTextCodec>
42 
43 #include "kcharsets.h"
44 #include <klocalizedstring.h>
45 
46 #include <ctype.h>
47 
// Numeric codec identifiers used in this file together with
// QTextCodec::mibEnum() and QTextCodec::codecForMib().
// NOTE(review): the values look like IANA MIBenum numbers - confirm against the registry.
enum MIB {
    MibLatin1  = 4,
    Mib8859_8  = 85,
    MibUtf8    = 106,
    MibUcs2    = 1000,
    MibUtf16   = 1015,
    MibUtf16BE = 1013,
    MibUtf16LE = 1014
};
57 
is16Bit(QTextCodec * codec)58 static bool is16Bit(QTextCodec *codec)
59 {
60     switch (codec->mibEnum()) {
61     case MibUtf16:
62     case MibUtf16BE:
63     case MibUtf16LE:
64     case MibUcs2:
65         return true;
66     default:
67         return false;
68     }
69 }
70 
71 class KEncodingDetectorPrivate
72 {
73 public:
74     QTextCodec *m_codec;
75     QTextDecoder *m_decoder; // utf16
76     QTextCodec *m_defaultCodec;
77     QByteArray  m_storeDecoderName;
78 
79     KEncodingDetector::EncodingChoiceSource m_source;
80     KEncodingDetector::AutoDetectScript m_autoDetectLanguage;
81 
82     bool m_visualRTL : 1;
83     bool m_seenBody : 1;
84     bool m_writtingHappened : 1;
85     bool m_analyzeCalled : 1; //for decode()
86     int m_multiByte;
87 
88     QByteArray m_bufferForDefferedEncDetection;
89 
KEncodingDetectorPrivate()90     KEncodingDetectorPrivate()
91         : m_codec(QTextCodec::codecForMib(MibLatin1))
92         , m_decoder(m_codec->makeDecoder())
93         , m_defaultCodec(m_codec)
94         , m_source(KEncodingDetector::DefaultEncoding)
95         , m_autoDetectLanguage(KEncodingDetector::SemiautomaticDetection)
96         , m_visualRTL(false)
97         , m_seenBody(false)
98         , m_writtingHappened(false)
99         , m_analyzeCalled(false)
100         , m_multiByte(0)
101     {
102     }
103 
KEncodingDetectorPrivate(QTextCodec * codec,KEncodingDetector::EncodingChoiceSource source,KEncodingDetector::AutoDetectScript script)104     KEncodingDetectorPrivate(QTextCodec *codec, KEncodingDetector::EncodingChoiceSource source, KEncodingDetector::AutoDetectScript script)
105         : m_codec(codec)
106         , m_decoder(m_codec->makeDecoder())
107         , m_defaultCodec(m_codec)
108         , m_source(source)
109         , m_autoDetectLanguage(script)
110         , m_visualRTL(false)
111         , m_seenBody(false)
112         , m_writtingHappened(false)
113         , m_analyzeCalled(false)
114         , m_multiByte(0)
115     {
116     }
117 
~KEncodingDetectorPrivate()118     ~KEncodingDetectorPrivate()
119     {
120         delete m_decoder;
121     }
122 
123     // Returns true if the encoding was explicitly specified someplace.
isExplicitlySpecifiedEncoding()124     bool isExplicitlySpecifiedEncoding()
125     {
126         return m_source != KEncodingDetector::DefaultEncoding && m_source != KEncodingDetector::AutoDetectedEncoding;
127     }
128 };
129 
automaticDetectionForArabic(const unsigned char * ptr,int size)130 static QByteArray automaticDetectionForArabic(const unsigned char *ptr, int size)
131 {
132     for (int i = 0; i < size; ++i) {
133         if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3
134                 || (ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB) || (ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA)
135                 || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0
136                 || (ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF) || (ptr[ i ] >= 0xF3)) {
137             return "cp1256";
138         }
139     }
140 
141     return "iso-8859-6";
142 }
143 
automaticDetectionForBaltic(const unsigned char * ptr,int size)144 static QByteArray automaticDetectionForBaltic(const unsigned char *ptr, int size)
145 {
146     for (int i = 0; i < size; ++i) {
147         if ((ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E)) {
148             return "cp1257";
149         }
150 
151         if (ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5) {
152             return "iso-8859-13";
153         }
154     }
155 
156     return "iso-8859-13";
157 }
158 
automaticDetectionForCentralEuropean(const unsigned char * ptr,int size)159 static QByteArray automaticDetectionForCentralEuropean(const unsigned char *ptr, int size)
160 {
161     QByteArray charset = QByteArray();
162     for (int i = 0; i < size; ++i) {
163         if (ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F) {
164             if (ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98) {
165                 return "ibm852";
166             }
167 
168             if (i + 1 > size) {
169                 return "cp1250";
170             } else { // maybe ibm852 ?
171                 charset = "cp1250";
172                 continue;
173             }
174         }
175         if (ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0) {
176             if (i + 1 > size) {
177                 return "iso-8859-2";
178             } else { // maybe ibm852 ?
179                 if (charset.isNull()) {
180                     charset = "iso-8859-2";
181                 }
182                 continue;
183             }
184         }
185     }
186 
187     if (charset.isNull()) {
188         charset = "iso-8859-3";
189     }
190 
191     return charset.data();
192 }
193 
// Guesses the Cyrillic encoding of the first @p size bytes at @p ptr by
// counting how often selected byte values occur.
//
// Returns "" when fewer than 8 bytes fell into the counted ranges,
// "UTF-8" when 0xd0/0xd1 bytes make up more than a third of the counted bytes,
// "ibm866" when bytes in 0xa0-0xaf outnumber the other two ranges combined,
// and otherwise "cp1251" or "koi8-u", whichever collects the higher score.
static QByteArray automaticDetectionForCyrillic(const unsigned char *ptr, int size)
{
#ifdef DECODE_DEBUG
    qCWarning(KHTML_LOG) << "KEncodingDetector: Cyr heuristics";
#endif

//     if (ptr[0]==0xef && ptr[1]==0xbb && ptr[2]==0xbf)
//         return "utf8";
    int utf8_mark = 0;
    int koi_score = 0;
    int cp1251_score = 0;

    // occurrences of the two-byte sequence the code labels "small st"
    int koi_st = 0;
    int cp1251_st = 0;

//     int koi_na=0;
//     int cp1251_na=0;

    // Per-letter counters. The same byte value is counted as a small letter
    // for one encoding and as a capital letter for the other.
    int koi_o_capital = 0;
    int koi_o = 0;
    int cp1251_o_capital = 0;
    int cp1251_o = 0;

    int koi_a_capital = 0;
    int koi_a = 0;
    int cp1251_a_capital = 0;
    int cp1251_a = 0;

    int koi_s_capital = 0;
    int koi_s = 0;
    int cp1251_s_capital = 0;
    int cp1251_s = 0;

    int koi_i_capital = 0;
    int koi_i = 0;
    int cp1251_i_capital = 0;
    int cp1251_i = 0;

    // number of bytes seen in 0xe0-0xff, 0xc0-0xdf and 0xa0-0xaf respectively
    int cp1251_small_range = 0;
    int koi_small_range = 0;
    int ibm866_small_range = 0;

    // Starts at 1 because the "st" checks read ptr[i - 1]; byte 0 is therefore
    // never counted. Stops once 1000 bytes fell into the two upper ranges.
    int i;
    for (i = 1; (i < size) && (cp1251_small_range + koi_small_range < 1000); ++i) {
        if (ptr[i] > 0xdf) {
            ++cp1251_small_range;

            if (ptr[i] == 0xee) { //small o
                ++cp1251_o;
            } else if (ptr[i] == 0xe0) { //small a
                ++cp1251_a;
            } else if (ptr[i] == 0xe8) { //small i
                ++cp1251_i;
            } else if (ptr[i] == 0xf1) { //small s
                ++cp1251_s;
            } else if (ptr[i] == 0xf2 && ptr[i - 1] == 0xf1) { //small st
                ++cp1251_st;
            }

            else if (ptr[i] == 0xef) {
                ++koi_o_capital;
            } else if (ptr[i] == 0xe1) {
                ++koi_a_capital;
            } else if (ptr[i] == 0xe9) {
                ++koi_i_capital;
            } else if (ptr[i] == 0xf3) {
                ++koi_s_capital;
            }

        } else if (ptr[i] > 0xbf) {
            ++koi_small_range;

            if (ptr[i] == 0xd0 || ptr[i] == 0xd1) { // counted as a UTF-8 indicator
                ++utf8_mark;
            } else if (ptr[i] == 0xcf) { //small o
                ++koi_o;
            } else if (ptr[i] == 0xc1) { //small a
                ++koi_a;
            } else if (ptr[i] == 0xc9) { //small i
                ++koi_i;
            } else if (ptr[i] == 0xd3) { //small s
                ++koi_s;
            } else if (ptr[i] == 0xd4 && ptr[i - 1] == 0xd3) { //small st
                ++koi_st;
            }

            else if (ptr[i] == 0xce) {
                ++cp1251_o_capital;
            } else if (ptr[i] == 0xc0) {
                ++cp1251_a_capital;
            } else if (ptr[i] == 0xc8) {
                ++cp1251_i_capital;
            } else if (ptr[i] == 0xd1) {
                // NOTE(review): never reached - 0xd1 is already consumed by the
                // first branch of this chain, so cp1251_s_capital stays 0.
                ++cp1251_s_capital;
            }
        } else if (ptr[i] > 0x9f && ptr[i] < 0xb0) { //first 16 letterz is 60%
            ++ibm866_small_range;
        }

    }

    //cannot decide?
    if (cp1251_small_range + koi_small_range + ibm866_small_range < 8) {
        return "";
    }

    if (3 * utf8_mark > cp1251_small_range + koi_small_range + ibm866_small_range) {
#ifdef DECODE_DEBUG
        qCWarning(KHTML_LOG) << "Cyr Enc Detection: UTF8";
#endif
        return "UTF-8";
    }

    if (ibm866_small_range > cp1251_small_range + koi_small_range) {
        return "ibm866";
    }

//     QByteArray koi_string = "koi8-u";
//     QByteArray cp1251_string = "cp1251";

    // "st" sequence seen (more than once) for one encoding only: +10 for it.
    if (cp1251_st == 0 && koi_st > 1) {
        koi_score += 10;
    } else if (koi_st == 0 && cp1251_st > 1) {
        cp1251_score += 10;
    }

    // Seen for both: +20 for the clearly dominant one. Integer division, so
    // "> 2" requires the larger count to be at least three times the smaller.
    if (cp1251_st && koi_st) {
        if (cp1251_st / koi_st > 2) {
            cp1251_score += 20;
        } else if (koi_st / cp1251_st > 2) {
            koi_score += 20;
        }
    }

    // Small letters: 10 points each to the encoding with more hits.
    // Ties with at least one hit go to KOI.
    if (cp1251_a > koi_a) {
        cp1251_score += 10;
    } else if (cp1251_a || koi_a) {
        koi_score += 10;
    }

    if (cp1251_o > koi_o) {
        cp1251_score += 10;
    } else if (cp1251_o || koi_o) {
        koi_score += 10;
    }

    if (cp1251_i > koi_i) {
        cp1251_score += 10;
    } else if (cp1251_i || koi_i) {
        koi_score += 10;
    }

    if (cp1251_s > koi_s) {
        cp1251_score += 10;
    } else if (cp1251_s || koi_s) {
        koi_score += 10;
    }

    // Capital letters: same rule, 9 points each.
    if (cp1251_a_capital > koi_a_capital) {
        cp1251_score += 9;
    } else if (cp1251_a_capital || koi_a_capital) {
        koi_score += 9;
    }

    if (cp1251_o_capital > koi_o_capital) {
        cp1251_score += 9;
    } else if (cp1251_o_capital || koi_o_capital) {
        koi_score += 9;
    }

    if (cp1251_i_capital > koi_i_capital) {
        cp1251_score += 9;
    } else if (cp1251_i_capital || koi_i_capital) {
        koi_score += 9;
    }

    if (cp1251_s_capital > koi_s_capital) {
        cp1251_score += 9;
    } else if (cp1251_s_capital || koi_s_capital) {
        koi_score += 9;
    }
#ifdef DECODE_DEBUG
    qCWarning(KHTML_LOG) << "koi_score " << koi_score << " cp1251_score " << cp1251_score;
#endif
    // Scores too close to call: decide by the raw range counts instead.
    if (abs(koi_score - cp1251_score) < 10) {
        //fallback...
        cp1251_score = cp1251_small_range;
        koi_score = koi_small_range;
    }
    // Equal scores resolve to koi8-u.
    if (cp1251_score > koi_score) {
        return "cp1251";
    } else {
        return "koi8-u";
    }

//     if (cp1251_score>koi_score)
//         setEncoding("cp1251",AutoDetectedEncoding);
//     else
//         setEncoding("koi8-u",AutoDetectedEncoding);
//     return true;

}
396 
automaticDetectionForGreek(const unsigned char * ptr,int size)397 static QByteArray automaticDetectionForGreek(const unsigned char *ptr, int size)
398 {
399     for (int i = 0; i < size; ++i) {
400         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B
401                 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4
402                 || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE) {
403             return "cp1253";
404         }
405     }
406 
407     return "iso-8859-7";
408 }
409 
automaticDetectionForHebrew(const unsigned char * ptr,int size)410 static QByteArray automaticDetectionForHebrew(const unsigned char *ptr, int size)
411 {
412     for (int i = 0; i < size; ++i) {
413         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89) || ptr[ i ] == 0x8B
414                 || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || (ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9)
415                 || (ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8)) {
416             return "cp1255";
417         }
418 
419         if (ptr[ i ] == 0xDF) {
420             return "iso-8859-8-i";
421         }
422     }
423 
424     return "iso-8859-8-i";
425 }
426 
automaticDetectionForJapanese(const unsigned char * ptr,int size)427 static QByteArray automaticDetectionForJapanese(const unsigned char *ptr, int size)
428 {
429     JapaneseCode kc;
430 
431     switch (kc.guess_jp((const char *)ptr, size)) {
432     case JapaneseCode::JIS:
433         return "jis7";
434     case JapaneseCode::EUC:
435         return "eucjp";
436     case JapaneseCode::SJIS:
437         return "sjis";
438     case JapaneseCode::UTF8:
439         return "utf8";
440     default:
441         break;
442     }
443 
444     return "";
445 }
446 
automaticDetectionForTurkish(const unsigned char * ptr,int size)447 static QByteArray automaticDetectionForTurkish(const unsigned char *ptr, int size)
448 {
449     for (int i = 0; i < size; ++i) {
450         if (ptr[ i ] == 0x80 || (ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C) || (ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C) || ptr[ i ] == 0x9F) {
451             return "cp1254";
452         }
453     }
454 
455     return "iso-8859-9";
456 }
457 
automaticDetectionForWesternEuropean(const unsigned char * ptr,int size)458 static QByteArray automaticDetectionForWesternEuropean(const unsigned char *ptr, int size)
459 {
460     --size;
461     uint nonansi_count = 0;
462     for (int i = 0; i < size; ++i) {
463         if (ptr[i] > 0x79) {
464             ++nonansi_count;
465             if (ptr[i] > 0xc1 && ptr[i] < 0xf0 && ptr[i + 1] > 0x7f && ptr[i + 1] < 0xc0) {
466                 return "UTF-8";
467             }
468             if (ptr[i] >= 0x78 && ptr[i] <= 0x9F) {
469                 return "cp1252";
470             }
471         }
472 
473     }
474 
475     if (nonansi_count > 0) {
476         return "iso-8859-15";
477     }
478 
479     return "";
480 }
481 
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
//
// On entry @p ptr points just past the opening "<!--". On return it points
// just past the comment terminator, or equals @p pEnd when the comment is
// not closed within [ptr, pEnd).
//
// Accepted terminators: ">" directly after the opener (i.e. "<!-->"),
// the regular "-->", and the incorrect "--!>" that other browsers allow.
//
// No byte at or beyond @p pEnd is read and @p ptr is never moved past
// @p pEnd. Callers loop until ptr == pEnd, so overshooting would make them
// run off the buffer.
static void skipComment(const char *&ptr, const char *pEnd)
{
    const char *p = ptr;
    if (p == pEnd) {
        // Nothing left to look at.
        return;
    }

    // Allow <!-->; other browsers do.
    if (*p == '>') {
        ++p;
    } else {
        while (p != pEnd) {
            if (*p == '-') {
                // This is the real end of comment, "-->".
                if (pEnd - p >= 3 && p[1] == '-' && p[2] == '>') {
                    p += 3;
                    break;
                }
                // This is the incorrect end of comment that other browsers allow, "--!>".
                if (pEnd - p >= 4 && p[1] == '-' && p[2] == '!' && p[3] == '>') {
                    p += 4;
                    break;
                }
            }
            ++p;
        }
    }
    ptr = p;
}
509 
510 // Returns the position of the encoding string.
findXMLEncoding(const QByteArray & str,int & encodingLength)511 static int findXMLEncoding(const QByteArray &str, int &encodingLength)
512 {
513     int len = str.length();
514     int pos = str.indexOf("encoding");
515     if (pos == -1) {
516         return -1;
517     }
518     pos += 8;
519 
520     // Skip spaces and stray control characters.
521     while (pos < len && str[pos] <= ' ') {
522         ++pos;
523     }
524 
525     //Bail out if nothing after
526     // Skip equals sign.
527     if (pos >= len || str[pos] != '=') {
528         return -1;
529     }
530     ++pos;
531 
532     // Skip spaces and stray control characters.
533     while (pos < len && str[pos] <= ' ') {
534         ++pos;
535     }
536 
537     //Bail out if nothing after
538     if (pos >= len) {
539         return -1;
540     }
541 
542     // Skip quotation mark.
543     char quoteMark = str[pos];
544     if (quoteMark != '"' && quoteMark != '\'') {
545         return -1;
546     }
547     ++pos;
548 
549     // Find the trailing quotation mark.
550     int end = pos;
551     while (end < len && str[end] != quoteMark) {
552         ++end;
553     }
554 
555     if (end >= len) {
556         return -1;
557     }
558 
559     encodingLength = end - pos;
560     return pos;
561 }
562 
processNull(char * data,int len)563 bool KEncodingDetector::processNull(char *data, int len)
564 {
565     bool bin = false;
566     if (is16Bit(d->m_codec)) {
567         for (int i = 1; i < len; i += 2) {
568             if ((data[i] == '\0') && (data[i - 1] == '\0')) {
569                 bin = true;
570                 data[i] = ' ';
571             }
572         }
573         return bin;
574     }
575     // replace '\0' by spaces, for buggy pages
576     int i = len - 1;
577     while (--i >= 0) {
578         if (data[i] == 0) {
579             bin = true;
580             data[i] = ' ';
581         }
582     }
583     return bin;
584 }
585 
errorsIfUtf8(const char * data,int length)586 bool KEncodingDetector::errorsIfUtf8(const char *data, int length)
587 {
588     if (d->m_codec->mibEnum() != MibUtf8) {
589         return false;    //means no errors
590     }
591 // #define highest1Bits (unsigned char)0x80
592 // #define highest2Bits (unsigned char)0xC0
593 // #define highest3Bits (unsigned char)0xE0
594 // #define highest4Bits (unsigned char)0xF0
595 // #define highest5Bits (unsigned char)0xF8
596     static const unsigned char highest1Bits = 0x80;
597     static const unsigned char highest2Bits = 0xC0;
598     static const unsigned char highest3Bits = 0xE0;
599     static const unsigned char highest4Bits = 0xF0;
600     static const unsigned char highest5Bits = 0xF8;
601 
602     for (int i = 0; i < length; ++i) {
603         unsigned char c = data[i];
604 
605         if (d->m_multiByte > 0) {
606             if ((c & highest2Bits) == 0x80) {
607                 --(d->m_multiByte);
608                 continue;
609             }
610 #ifdef DECODE_DEBUG
611             qCWarning(KHTML_LOG) << "EncDetector: Broken UTF8";
612 #endif
613             return true;
614         }
615 
616         // most significant bit zero, single char
617         if ((c & highest1Bits) == 0x00) {
618             continue;
619         }
620 
621         // 110xxxxx => init 1 following bytes
622         if ((c & highest3Bits) == 0xC0) {
623             d->m_multiByte = 1;
624             continue;
625         }
626 
627         // 1110xxxx => init 2 following bytes
628         if ((c & highest4Bits) == 0xE0) {
629             d->m_multiByte = 2;
630             continue;
631         }
632 
633         // 11110xxx => init 3 following bytes
634         if ((c & highest5Bits) == 0xF0) {
635             d->m_multiByte = 3;
636             continue;
637         }
638 #ifdef DECODE_DEBUG
639         qCWarning(KHTML_LOG) << "EncDetector:_Broken UTF8";
640 #endif
641         return true;
642     }
643     return false;
644 }
645 
// Creates a detector whose private state is default-constructed
// (see KEncodingDetectorPrivate's default constructor).
KEncodingDetector::KEncodingDetector() : d(new KEncodingDetectorPrivate)
{
}
649 
// Creates a detector that starts out with @p codec, records @p source as the
// origin of that choice and @p script as the script for automatic detection.
// @p codec must not be null: the private constructor creates a decoder from it.
KEncodingDetector::KEncodingDetector(QTextCodec *codec, EncodingChoiceSource source, AutoDetectScript script) :
    d(new KEncodingDetectorPrivate(codec, source, script))
{
}
654 
// Releases the private state (which in turn deletes the decoder it owns).
KEncodingDetector::~KEncodingDetector()
{
    delete d;
}
659 
// Stores @p lang as the script used for automatic detection.
void KEncodingDetector::setAutoDetectLanguage(KEncodingDetector::AutoDetectScript lang)
{
    d->m_autoDetectLanguage = lang;
}
// Returns the script currently stored for automatic detection.
KEncodingDetector::AutoDetectScript KEncodingDetector::autoDetectLanguage() const
{
    return d->m_autoDetectLanguage;
}
668 
// Returns the recorded origin of the current encoding choice.
KEncodingDetector::EncodingChoiceSource KEncodingDetector::encodingChoiceSource() const
{
    return d->m_source;
}
673 
// Returns the name of the current codec as a C string.
// The name is copied into d->m_storeDecoderName on every call and the
// returned pointer refers to that member's storage, so it is only good
// until the next call or until the detector is destroyed.
const char *KEncodingDetector::encoding() const
{
    d->m_storeDecoderName = d->m_codec->name();
    return d->m_storeDecoderName.constData();
}
679 
// Returns the visual-RTL flag, which setEncoding() raises for ISO-8859-8
// names other than the logical variants.
bool KEncodingDetector::visuallyOrdered() const
{
    return d->m_visualRTL;
}
684 
685 // const QTextCodec* KEncodingDetector::codec() const
686 // {
687 //     return d->m_codec;
688 // }
689 
// Returns the decoder currently in use. It stays owned by the detector and
// is deleted and replaced whenever the codec changes or resetDecoder() runs.
QTextDecoder *KEncodingDetector::decoder()
{
    return d->m_decoder;
}
694 
resetDecoder()695 void KEncodingDetector::resetDecoder()
696 {
697     assert(d->m_defaultCodec);
698     d->m_bufferForDefferedEncDetection.clear();
699     d->m_writtingHappened = false;
700     d->m_analyzeCalled = false;
701     d->m_multiByte = 0;
702     delete d->m_decoder;
703     if (!d->m_codec) {
704         d->m_codec = d->m_defaultCodec;
705     }
706     d->m_decoder = d->m_codec->makeDecoder();
707 }
708 
setEncoding(const char * _encoding,EncodingChoiceSource type)709 bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
710 {
711     QTextCodec *codec;
712     QByteArray enc(_encoding);
713     if (/*enc.isNull() || */enc.isEmpty()) {
714         if (type == DefaultEncoding) {
715             codec = d->m_defaultCodec;
716         } else {
717             return false;
718         }
719     } else {
720         //QString->QTextCodec
721 
722         enc = enc.toLower();
723         // hebrew visually ordered
724         if (enc == "visual") {
725             enc = "iso8859-8";
726         }
727         bool b;
728         codec = KCharsets::charsets()->codecForName(QLatin1String(enc.data()), b);
729         if (!b) {
730             return false;
731         }
732     }
733 
734     if (d->m_codec->mibEnum() == codec->mibEnum()) {
735         // We already have the codec, but we still want to re-set the type,
736         // as we may have overwritten a default with a detected
737         d->m_source = type;
738         return true;
739     }
740 
741     if ((type == EncodingFromMetaTag || type == EncodingFromXMLHeader) && is16Bit(codec)) {
742         //Sometimes the codec specified is absurd, i.e. UTF-16 despite
743         //us decoding a meta tag as ASCII. In that case, ignore it.
744         return false;
745     }
746 
747     if (codec->mibEnum() == Mib8859_8) {
748         //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.
749         codec = QTextCodec::codecForName("iso8859-8-i");
750 
751         // visually ordered unless one of the following
752         if (!(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical")) {
753             d->m_visualRTL = true;
754         }
755     }
756 
757     d->m_codec = codec;
758     d->m_source = type;
759     delete d->m_decoder;
760     d->m_decoder = d->m_codec->makeDecoder();
761 #ifdef DECODE_DEBUG
762     qCDebug(KHTML_LOG) << "KEncodingDetector::encoding used is" << d->m_codec->name();
763 #endif
764     return true;
765 }
766 
decode(const char * data,int len)767 QString KEncodingDetector::decode(const char *data, int len)
768 {
769     processNull(const_cast<char *>(data), len);
770     if (!d->m_analyzeCalled) {
771         analyze(data, len);
772         d->m_analyzeCalled = true;
773     }
774 
775     return d->m_decoder->toUnicode(data, len);
776 }
777 
decode(const QByteArray & data)778 QString KEncodingDetector::decode(const QByteArray &data)
779 {
780     processNull(const_cast<char *>(data.data()), data.size());
781     if (!d->m_analyzeCalled) {
782         analyze(data.data(), data.size());
783         d->m_analyzeCalled = true;
784     }
785 
786     return d->m_decoder->toUnicode(data);
787 }
788 
decodeWithBuffering(const char * data,int len)789 QString KEncodingDetector::decodeWithBuffering(const char *data, int len)
790 {
791 #ifdef DECODE_DEBUG
792     qCWarning(KHTML_LOG) << "KEncodingDetector: decoding " << len << " bytes";
793 #endif
794     if (d->m_writtingHappened) {
795 #ifdef DECODE_DEBUG
796         qCWarning(KHTML_LOG) << "KEncodingDetector: d->m_writtingHappened " << d->m_codec->name();
797 #endif
798         processNull(const_cast<char *>(data), len);
799         return d->m_decoder->toUnicode(data, len);
800     } else {
801         if (d->m_bufferForDefferedEncDetection.isEmpty()) {
802             // If encoding detection produced something, and we either got to the body or
803             // actually saw the encoding explicitly, we're done.
804             if (analyze(data, len) && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) {
805 #ifdef DECODE_DEBUG
806                 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened first time " << d->m_codec->name();
807 #endif
808                 processNull(const_cast<char *>(data), len);
809                 d->m_writtingHappened = true;
810                 return d->m_decoder->toUnicode(data, len);
811             } else {
812 #ifdef DECODE_DEBUG
813                 qCWarning(KHTML_LOG) << "KEncodingDetector: begin deffer";
814 #endif
815                 d->m_bufferForDefferedEncDetection = data;
816             }
817         } else {
818             d->m_bufferForDefferedEncDetection += data;
819             // As above, but also limit the buffer size. We must use the entire buffer here,
820             // since the boundaries might split the meta tag, etc.
821             bool detected = analyze(d->m_bufferForDefferedEncDetection.constData(), d->m_bufferForDefferedEncDetection.length());
822             if ((detected && (d->m_seenBody || d->isExplicitlySpecifiedEncoding())) ||
823                     d->m_bufferForDefferedEncDetection.length() > MAX_BUFFER) {
824                 d->m_writtingHappened = true;
825                 d->m_bufferForDefferedEncDetection.replace('\0', ' ');
826                 QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
827                 d->m_bufferForDefferedEncDetection.clear();
828 #ifdef DECODE_DEBUG
829                 qCWarning(KHTML_LOG) << "KEncodingDetector: m_writtingHappened in the middle " << d->m_codec->name();
830 #endif
831                 return result;
832             }
833         }
834     }
835 
836     return QString();
837 }
838 
decodedInvalidCharacters() const839 bool KEncodingDetector::decodedInvalidCharacters() const
840 {
841     return d->m_decoder ? d->m_decoder->hasFailure() : false;
842 }
843 
flush()844 QString KEncodingDetector::flush()
845 {
846     if (d->m_bufferForDefferedEncDetection.isEmpty()) {
847         return QString();
848     }
849 
850     d->m_bufferForDefferedEncDetection.replace('\0', ' ');
851     QString result(d->m_decoder->toUnicode(d->m_bufferForDefferedEncDetection));
852     d->m_bufferForDefferedEncDetection.clear();
853 #ifdef DECODE_DEBUG
854     qCWarning(KHTML_LOG) << "KEncodingDetector:flush() " << d->m_bufferForDefferedEncDetection.length() << " bytes " << d->m_codec->name();
855 #endif
856     return result;
857 }
858 
analyze(const char * data,int len)859 bool KEncodingDetector::analyze(const char *data, int len)
860 {
861     // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
862     // maximumBOMLength = 10
863     // Even if the user has chosen utf16 we still need to auto-detect the endianness
864     if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) {
865         // Extract the first three bytes.
866         const uchar *udata = (const uchar *)data;
867         uchar c1 = *udata++;
868         uchar c2 = *udata++;
869         uchar c3 = *udata++;
870 
871         // Check for the BOM
872         const char *autoDetectedEncoding;
873         if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
874             autoDetectedEncoding = "UTF-16";
875         } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
876             autoDetectedEncoding = "UTF-8";
877         } else if (c1 == 0x00 || c2 == 0x00) {
878             uchar c4 = *udata++;
879             uchar c5 = *udata++;
880             uchar c6 = *udata++;
881             uchar c7 = *udata++;
882             uchar c8 = *udata++;
883             uchar c9 = *udata++;
884             uchar c10 = *udata++;
885 
886             int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
887             int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
888             if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) {
889                 autoDetectedEncoding = "UTF-16";
890             } else {
891                 autoDetectedEncoding = nullptr;
892             }
893         } else {
894             autoDetectedEncoding = nullptr;
895         }
896 
897         // If we found a BOM, use the encoding it implies.
898         if (autoDetectedEncoding != nullptr) {
899             d->m_source = BOM;
900             d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
901             assert(d->m_codec);
902             //enc = d->m_codec->name();
903             delete d->m_decoder;
904             d->m_decoder = d->m_codec->makeDecoder();
905 #ifdef DECODE_DEBUG
906             qCWarning(KHTML_LOG) << "Detection by BOM";
907 #endif
908             if (is16Bit(d->m_codec) && c2 == 0x00) {
909                 // utf16LE, we need to put the decoder in LE mode
910                 char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
911                 d->m_decoder->toUnicode(reverseUtf16, 2);
912             }
913             return true;
914         }
915     }
916 
917     //exit from routine in case it was called to only detect byte order for utf-16
918     if (d->m_source == UserChosenEncoding) {
919 #ifdef DECODE_DEBUG
920         qCWarning(KHTML_LOG) << "KEncodingDetector: UserChosenEncoding exit ";
921 #endif
922 
923         if (errorsIfUtf8(data, len)) {
924             setEncoding("", DefaultEncoding);
925         }
926         return true;
927     }
928 
929     // HTTP header takes precedence over meta-type stuff
930     if (d->m_source == EncodingFromHTTPHeader) {
931         return true;
932     }
933 
934     if (!d->m_seenBody) {
935         // we still don't have an encoding, and are in the head
936         // the following tags are allowed in <head>:
937         // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
938         const char *ptr = data;
939         const char *pEnd = data + len;
940 
941         while (ptr != pEnd) {
942             if (*ptr != '<') {
943                 ++ptr;
944                 continue;
945             }
946             ++ptr;
947             // Handle comments.
948             if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
949                 ptr += 3;
950                 skipComment(ptr, pEnd);
951                 continue;
952             }
953 
954             // Handle XML header, which can have encoding in it.
955             if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
956                 const char *end = ptr;
957                 while (*end != '>' && end < pEnd) {
958                     end++;
959                 }
960                 if (*end == '\0' || end == pEnd) {
961                     break;
962                 }
963                 QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
964                 int length;
965                 int pos = findXMLEncoding(str, length);
966                 // also handles the case when specified encoding aint correct
967                 if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) {
968                     return true;
969                 }
970             }
971 
972             //look for <meta>, stop if we reach <body>
973             while (
974                 !(((*ptr >= 'a') && (*ptr <= 'z')) ||
975                   ((*ptr >= 'A') && (*ptr <= 'Z')))
976                 && ptr < pEnd
977             ) {
978                 ++ptr;
979             }
980 
981             char tmp[5];
982             int length = 0;
983             const char *max = ptr + 4;
984             if (pEnd < max) {
985                 max = pEnd;
986             }
987             while (
988                 (((*ptr >= 'a') && (*ptr <= 'z')) ||
989                  ((*ptr >= 'A') && (*ptr <= 'Z')) ||
990                  ((*ptr >= '0') && (*ptr <= '9')))
991                 && ptr < max
992             ) {
993                 tmp[length] = tolower(*ptr);
994                 ++ptr;
995                 ++length;
996             }
997             tmp[length] = 0;
998             if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') {
999                 // found a meta tag...
1000                 const char *end = ptr;
1001                 while (*end != '>' && *end != '\0' && end < pEnd) {
1002                     end++;
1003                 }
1004                 //if ( *end == '\0' ) break;
1005                 const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower();
1006                 const int strLength = str.length();
1007                 int pos = 0;
1008                 //if( (pos = str.find("http-equiv", pos)) == -1) break;
1009                 //if( (pos = str.find("content-type", pos)) == -1) break;
1010                 if ((pos = str.indexOf("charset")) == -1) {
1011                     continue;
1012                 }
1013                 pos += 6;
1014                 // skip to '='
1015                 if ((pos = str.indexOf("=", pos)) == -1) {
1016                     continue;
1017                 }
1018 
1019                 // skip '='
1020                 ++pos;
1021 
1022                 // skip whitespace before encoding itself
1023                 while (pos < strLength && str[pos] <= ' ') {
1024                     ++pos;
1025                 }
1026 
1027                 // there may also be an opening quote, if this is a charset= and not a http-equiv.
1028                 if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) {
1029                     ++pos;
1030                 }
1031 
1032                 // skip whitespace
1033                 while (pos < strLength && str[pos] <= ' ') {
1034                     ++pos;
1035                 }
1036 
1037                 if (pos == strLength) {
1038                     continue;
1039                 }
1040 
1041                 int endpos = pos;
1042                 while (endpos < strLength &&
1043                         (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
1044                          && str[endpos] != ';' && str[endpos] != '>')) {
1045                     ++endpos;
1046                 }
1047 #ifdef DECODE_DEBUG
1048                 qCDebug(KHTML_LOG) << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data();
1049 #endif
1050                 if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) {
1051                     return true;
1052                 }
1053             } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') {
1054                 d->m_seenBody = true;
1055                 break;
1056             }
1057         }
1058     }
1059 
1060     if (len < 20) {
1061         return false;
1062     }
1063 
1064 #ifdef DECODE_DEBUG
1065     qCDebug(KHTML_LOG) << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
1066 #endif
1067 
1068     switch (d->m_autoDetectLanguage) {
1069     case KEncodingDetector::Arabic:
1070         return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1071 //             break;
1072     case KEncodingDetector::Baltic:
1073         return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1074 //             break;
1075     case KEncodingDetector::CentralEuropean:
1076         return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1077 //            break;
1078     case KEncodingDetector::Cyrillic:
1079         return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1080 //             break;
1081     case KEncodingDetector::Greek:
1082         return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1083 //             break;
1084     case KEncodingDetector::Hebrew:
1085         return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1086 //             break;
1087     case KEncodingDetector::Japanese:
1088         return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1089 //             break;
1090     case KEncodingDetector::Turkish:
1091         return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding);
1092 //             break;
1093     case KEncodingDetector::WesternEuropean:
1094         if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) {
1095             return true;
1096         } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml
1097             return setEncoding("iso-8859-15", AutoDetectedEncoding);
1098         } else { //use default provided by eg katepart
1099             return setEncoding("", DefaultEncoding);
1100         }
1101 //             break;
1102     case KEncodingDetector::SemiautomaticDetection:
1103     case KEncodingDetector::ChineseSimplified:
1104     case KEncodingDetector::ChineseTraditional:
1105     case KEncodingDetector::Korean:
1106     case KEncodingDetector::Thai:
1107     case KEncodingDetector::Unicode:
1108     case KEncodingDetector::NorthernSaami:
1109     case KEncodingDetector::SouthEasternEurope:
1110     case KEncodingDetector::None:
1111         // huh. somethings broken in this code ### FIXME
1112         //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
1113         break;
1114     }
1115 
1116     return true;
1117 }
1118 
scriptForName(const QString & lang)1119 KEncodingDetector::AutoDetectScript KEncodingDetector::scriptForName(const QString &lang)
1120 {
1121     if (lang.isEmpty()) {
1122         return KEncodingDetector::None;
1123     } else if (lang == i18nc("@item Text character set", "Unicode")) {
1124         return KEncodingDetector::Unicode;
1125     } else if (lang == i18nc("@item Text character set", "Cyrillic")) {
1126         return KEncodingDetector::Cyrillic;
1127     } else if (lang == i18nc("@item Text character set", "Western European")) {
1128         return KEncodingDetector::WesternEuropean;
1129     } else if (lang == i18nc("@item Text character set", "Central European")) {
1130         return KEncodingDetector::CentralEuropean;
1131     } else if (lang == i18nc("@item Text character set", "Greek")) {
1132         return KEncodingDetector::Greek;
1133     } else if (lang == i18nc("@item Text character set", "Hebrew")) {
1134         return KEncodingDetector::Hebrew;
1135     } else if (lang == i18nc("@item Text character set", "Turkish")) {
1136         return KEncodingDetector::Turkish;
1137     } else if (lang == i18nc("@item Text character set", "Japanese")) {
1138         return KEncodingDetector::Japanese;
1139     } else if (lang == i18nc("@item Text character set", "Baltic")) {
1140         return KEncodingDetector::Baltic;
1141     } else if (lang == i18nc("@item Text character set", "Arabic")) {
1142         return KEncodingDetector::Arabic;
1143     }
1144 
1145     return KEncodingDetector::None;
1146 }
1147 
hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)1148 bool KEncodingDetector::hasAutoDetectionForScript(KEncodingDetector::AutoDetectScript script)
1149 {
1150     switch (script) {
1151     case KEncodingDetector::Arabic:
1152         return true;
1153     case KEncodingDetector::Baltic:
1154         return true;
1155     case KEncodingDetector::CentralEuropean:
1156         return true;
1157     case KEncodingDetector::Cyrillic:
1158         return true;
1159     case KEncodingDetector::Greek:
1160         return true;
1161     case KEncodingDetector::Hebrew:
1162         return true;
1163     case KEncodingDetector::Japanese:
1164         return true;
1165     case KEncodingDetector::Turkish:
1166         return true;
1167     case KEncodingDetector::WesternEuropean:
1168         return true;
1169     case KEncodingDetector::ChineseTraditional:
1170         return true;
1171     case KEncodingDetector::ChineseSimplified:
1172         return true;
1173     case KEncodingDetector::Unicode:
1174         return true;
1175         break;
1176     default:
1177         return false;
1178     }
1179 }
1180 
nameForScript(KEncodingDetector::AutoDetectScript script)1181 QString KEncodingDetector::nameForScript(KEncodingDetector::AutoDetectScript script)
1182 {
1183     switch (script) {
1184     case KEncodingDetector::Arabic:
1185         return i18nc("@item Text character set", "Arabic");
1186         break;
1187     case KEncodingDetector::Baltic:
1188         return i18nc("@item Text character set", "Baltic");
1189         break;
1190     case KEncodingDetector::CentralEuropean:
1191         return i18nc("@item Text character set", "Central European");
1192         break;
1193     case KEncodingDetector::Cyrillic:
1194         return i18nc("@item Text character set", "Cyrillic");
1195         break;
1196     case KEncodingDetector::Greek:
1197         return i18nc("@item Text character set", "Greek");
1198         break;
1199     case KEncodingDetector::Hebrew:
1200         return i18nc("@item Text character set", "Hebrew");
1201         break;
1202     case KEncodingDetector::Japanese:
1203         return i18nc("@item Text character set", "Japanese");
1204         break;
1205     case KEncodingDetector::Turkish:
1206         return i18nc("@item Text character set", "Turkish");
1207         break;
1208     case KEncodingDetector::WesternEuropean:
1209         return i18nc("@item Text character set", "Western European");
1210         break;
1211     case KEncodingDetector::ChineseTraditional:
1212         return i18nc("@item Text character set", "Chinese Traditional");
1213         break;
1214     case KEncodingDetector::ChineseSimplified:
1215         return i18nc("@item Text character set", "Chinese Simplified");
1216         break;
1217     case KEncodingDetector::Korean:
1218         return i18nc("@item Text character set", "Korean");
1219         break;
1220     case KEncodingDetector::Thai:
1221         return i18nc("@item Text character set", "Thai");
1222         break;
1223     case KEncodingDetector::Unicode:
1224         return i18nc("@item Text character set", "Unicode");
1225         break;
1226     //case KEncodingDetector::SemiautomaticDetection:
1227     default:
1228         return QString();
1229 
1230     }
1231 }
1232 
1233 #undef DECODE_DEBUG
1234 
1235