1 /************************************************************************
2 **
3 **  Copyright (C) 2016-2019 Kevin B. Hendricks Stratford, Ontario, Canada
4 **  Copyright (C) 2013      John Schember <john@nachtimwald.com>
5 **  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
6 **
7 **  This file is part of Sigil.
8 **
9 **  Sigil is free software: you can redistribute it and/or modify
10 **  it under the terms of the GNU General Public License as published by
11 **  the Free Software Foundation, either version 3 of the License, or
12 **  (at your option) any later version.
13 **
14 **  Sigil is distributed in the hope that it will be useful,
15 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 **  GNU General Public License for more details.
18 **
19 **  You should have received a copy of the GNU General Public License
20 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
21 **
22 *************************************************************************/
23 
24 #include <string>
25 
26 #include <QtCore/QFile>
27 #include <QtCore/QString>
28 #include <QtCore/QTextCodec>
29 #include <QRegularExpression>
30 
31 #include "Misc/HTMLEncodingResolver.h"
32 #include "Misc/Utility.h"
33 #include "sigil_constants.h"
34 #include "sigil_exception.h"
35 
36 
37 const QString ENCODING_ATTRIBUTE   = "encoding\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
38 const QString CHARSET_ATTRIBUTE    = "charset\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
39 const QString STANDALONE_ATTRIBUTE = "standalone\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
40 const QString VERSION_ATTRIBUTE    = "<\\?xml[^>]*version\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')[^>]*>";
41 
42 
43 // Accepts a full path to an HTML file.
44 // Reads the file, detects the encoding
45 // and returns the text converted to Unicode.
ReadHTMLFile(const QString & fullfilepath)46 QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath)
47 {
48     QFile file(fullfilepath);
49 
50     // Check if we can open the file
51     if (!file.open(QFile::ReadOnly)) {
52         std::string msg = file.fileName().toStdString() + ": " + file.errorString().toStdString();
53         throw (CannotOpenFile(msg));
54     }
55 
56     QByteArray data = file.readAll();
57 
58     return Utility::ConvertLineEndings(GetCodecForHTML(data)->toUnicode(data));
59 }
60 
61 
62 // Accepts an HTML stream and tries to determine its encoding;
63 // if no encoding is detected, the default codec for this locale is returned.
64 // We use this function because Qt's QTextCodec::codecForHtml() function
65 // leaves a *lot* to be desired.
GetCodecForHTML(const QByteArray & raw_text)66 const QTextCodec *HTMLEncodingResolver::GetCodecForHTML(const QByteArray &raw_text)
67 {
68     unsigned char c1;
69     unsigned char c2;
70     unsigned char c3;
71     unsigned char c4;
72     QString text;
73     QTextCodec *codec;
74 
75     if (raw_text.count() < 4) {
76         return QTextCodec::codecForName("UTF-8");
77     }
78 
79     // Check the BOM if present.
80     c1 = raw_text.at(0);
81     c2 = raw_text.at(1);
82     c3 = raw_text.at(2);
83     c4 = raw_text.at(3);
84     if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
85         return QTextCodec::codecForName("UTF-8");
86     } else if (c1 == 0xFF && c2 == 0xFE && c3 == 0 && c4 == 0) {
87         return QTextCodec::codecForName("UTF-32LE");
88     } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
89         return QTextCodec::codecForName("UTF-32BE");
90     } else if (c1 == 0xFE && c2 == 0xFF) {
91         return QTextCodec::codecForName("UTF-16BE");
92     } else if (c1 == 0xFF && c2 == 0xFE) {
93         return QTextCodec::codecForName("UTF-16LE");
94     }
95 
96     // Alternating char followed by 0 is typical of utf 16 le without BOM.
97     if (c1 != 0 && c2 == 0 && c3 != 0 && c4 == 0) {
98         return QTextCodec::codecForName("UTF-16LE");
99     }
100 
101     // Try to find an ecoding specified in the file itself.
102     text = Utility::Substring(0, 1024, raw_text);
103 
104     // Check if the xml encoding attribute is set.
105     QRegularExpression enc_re(ENCODING_ATTRIBUTE);
106     QRegularExpressionMatch enc_mo = enc_re.match(text);
107     if (enc_mo.hasMatch()) {
108         codec = QTextCodec::codecForName(enc_mo.captured(1).toLatin1().toUpper());
109         if (codec) {
110             return codec;
111         }
112     }
113 
114     // Check if the charset is set in the head.
115     QRegularExpression char_re(CHARSET_ATTRIBUTE);
116     QRegularExpressionMatch char_mo = char_re.match(text);
117     if (char_mo.hasMatch()) {
118         codec = QTextCodec::codecForName(char_mo.captured(1).toLatin1().toUpper());
119         if (codec) {
120             return codec;
121         }
122     }
123 
124     // See if all characters within this document are utf-8.
125     if (IsValidUtf8(raw_text)) {
126         return QTextCodec::codecForName("UTF-8");
127     }
128 
129     // Finally, let Qt guess and if it doesn't know it will return the codec
130     // for the current locale.
131     text = raw_text;
132     return QTextCodec::codecForHtml(raw_text, QTextCodec::codecForLocale());
133 }
134 
135 
136 // This function goes through the entire byte array
137 // and tries to see whether this is a valid UTF-8 sequence.
138 // If it's valid, this is probably a UTF-8 string.
IsValidUtf8(const QByteArray & string)139 bool HTMLEncodingResolver::IsValidUtf8(const QByteArray &string)
140 {
141     // This is an implementation of the Perl code written here:
142     //   http://www.w3.org/International/questions/qa-forms-utf-8
143     //
144     // Basically, UTF-8 has a very specific byte-pattern. This function
145     // checks if the sent byte-sequence conforms to this pattern.
146     // If it does, chances are *very* high that this is UTF-8.
147     //
148     // This function is written to be fast, not pretty.
149     if (string.isNull()) {
150         return false;
151     }
152 
153     int index = 0;
154 
155     while (index < string.size()) {
156         QByteArray dword = string.mid(index, 4);
157 
158         if (dword.size() < 4) {
159             dword = dword.leftJustified(4, '\0');
160         }
161 
162         const unsigned char *bytes = (const unsigned char *) dword.constData();
163 
164         // ASCII
165         if (bytes[0] == 0x09 ||
166             bytes[0] == 0x0A ||
167             bytes[0] == 0x0D ||
168             (0x20 <= bytes[0] && bytes[0] <= 0x7E)
169            ) {
170             index += 1;
171         }
172         // non-overlong 2-byte
173         else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
174                  (0x80 <= bytes[1] && bytes[1] <= 0xBF)
175                 ) {
176             index += 2;
177         } else if ((bytes[0] == 0xE0                         &&              // excluding overlongs
178                     (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
179                     (0x80 <= bytes[2] && bytes[2] <= 0xBF)) ||
180                    (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||               // straight 3-byte
181                      bytes[0] == 0xEE                         ||
182                      bytes[0] == 0xEF) &&
183                     (0x80 <= bytes[1] && bytes[1] <= 0xBF)   &&
184                     (0x80 <= bytes[2] && bytes[2] <= 0xBF)) ||
185                    (bytes[0] == 0xED                         &&              // excluding surrogates
186                     (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
187                     (0x80 <= bytes[2] && bytes[2] <= 0xBF))
188                   ) {
189             index += 3;
190         } else if ((bytes[0] == 0xF0                         &&              // planes 1-3
191                     (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
192                     (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
193                     (0x80 <= bytes[3] && bytes[3] <= 0xBF)) ||
194                    ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&              // planes 4-15
195                     (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
196                     (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
197                     (0x80 <= bytes[3] && bytes[3] <= 0xBF)) ||
198                    (bytes[0] == 0xF4                         &&            // plane 16
199                     (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
200                     (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
201                     (0x80 <= bytes[3] && bytes[3] <= 0xBF))
202                   ) {
203             index += 4;
204         } else {
205             return false;
206         }
207     }
208 
209     return true;
210 }
211