1 /************************************************************************
2 **
3 ** Copyright (C) 2016-2019 Kevin B. Hendricks Stratford, Ontario, Canada
4 ** Copyright (C) 2013 John Schember <john@nachtimwald.com>
5 ** Copyright (C) 2009-2011 Strahinja Markovic <strahinja.markovic@gmail.com>
6 **
7 ** This file is part of Sigil.
8 **
9 ** Sigil is free software: you can redistribute it and/or modify
10 ** it under the terms of the GNU General Public License as published by
11 ** the Free Software Foundation, either version 3 of the License, or
12 ** (at your option) any later version.
13 **
14 ** Sigil is distributed in the hope that it will be useful,
15 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ** GNU General Public License for more details.
18 **
19 ** You should have received a copy of the GNU General Public License
20 ** along with Sigil. If not, see <http://www.gnu.org/licenses/>.
21 **
22 *************************************************************************/
23
24 #include <string>
25
26 #include <QtCore/QFile>
27 #include <QtCore/QString>
28 #include <QtCore/QTextCodec>
29 #include <QRegularExpression>
30
31 #include "Misc/HTMLEncodingResolver.h"
32 #include "Misc/Utility.h"
33 #include "sigil_constants.h"
34 #include "sigil_exception.h"
35
36
37 const QString ENCODING_ATTRIBUTE = "encoding\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
38 const QString CHARSET_ATTRIBUTE = "charset\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
39 const QString STANDALONE_ATTRIBUTE = "standalone\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')";
40 const QString VERSION_ATTRIBUTE = "<\\?xml[^>]*version\\s*=\\s*(?:\"|')([^\"']+)(?:\"|')[^>]*>";
41
42
43 // Accepts a full path to an HTML file.
44 // Reads the file, detects the encoding
45 // and returns the text converted to Unicode.
ReadHTMLFile(const QString & fullfilepath)46 QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath)
47 {
48 QFile file(fullfilepath);
49
50 // Check if we can open the file
51 if (!file.open(QFile::ReadOnly)) {
52 std::string msg = file.fileName().toStdString() + ": " + file.errorString().toStdString();
53 throw (CannotOpenFile(msg));
54 }
55
56 QByteArray data = file.readAll();
57
58 return Utility::ConvertLineEndings(GetCodecForHTML(data)->toUnicode(data));
59 }
60
61
62 // Accepts an HTML stream and tries to determine its encoding;
63 // if no encoding is detected, the default codec for this locale is returned.
64 // We use this function because Qt's QTextCodec::codecForHtml() function
65 // leaves a *lot* to be desired.
GetCodecForHTML(const QByteArray & raw_text)66 const QTextCodec *HTMLEncodingResolver::GetCodecForHTML(const QByteArray &raw_text)
67 {
68 unsigned char c1;
69 unsigned char c2;
70 unsigned char c3;
71 unsigned char c4;
72 QString text;
73 QTextCodec *codec;
74
75 if (raw_text.count() < 4) {
76 return QTextCodec::codecForName("UTF-8");
77 }
78
79 // Check the BOM if present.
80 c1 = raw_text.at(0);
81 c2 = raw_text.at(1);
82 c3 = raw_text.at(2);
83 c4 = raw_text.at(3);
84 if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
85 return QTextCodec::codecForName("UTF-8");
86 } else if (c1 == 0xFF && c2 == 0xFE && c3 == 0 && c4 == 0) {
87 return QTextCodec::codecForName("UTF-32LE");
88 } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
89 return QTextCodec::codecForName("UTF-32BE");
90 } else if (c1 == 0xFE && c2 == 0xFF) {
91 return QTextCodec::codecForName("UTF-16BE");
92 } else if (c1 == 0xFF && c2 == 0xFE) {
93 return QTextCodec::codecForName("UTF-16LE");
94 }
95
96 // Alternating char followed by 0 is typical of utf 16 le without BOM.
97 if (c1 != 0 && c2 == 0 && c3 != 0 && c4 == 0) {
98 return QTextCodec::codecForName("UTF-16LE");
99 }
100
101 // Try to find an ecoding specified in the file itself.
102 text = Utility::Substring(0, 1024, raw_text);
103
104 // Check if the xml encoding attribute is set.
105 QRegularExpression enc_re(ENCODING_ATTRIBUTE);
106 QRegularExpressionMatch enc_mo = enc_re.match(text);
107 if (enc_mo.hasMatch()) {
108 codec = QTextCodec::codecForName(enc_mo.captured(1).toLatin1().toUpper());
109 if (codec) {
110 return codec;
111 }
112 }
113
114 // Check if the charset is set in the head.
115 QRegularExpression char_re(CHARSET_ATTRIBUTE);
116 QRegularExpressionMatch char_mo = char_re.match(text);
117 if (char_mo.hasMatch()) {
118 codec = QTextCodec::codecForName(char_mo.captured(1).toLatin1().toUpper());
119 if (codec) {
120 return codec;
121 }
122 }
123
124 // See if all characters within this document are utf-8.
125 if (IsValidUtf8(raw_text)) {
126 return QTextCodec::codecForName("UTF-8");
127 }
128
129 // Finally, let Qt guess and if it doesn't know it will return the codec
130 // for the current locale.
131 text = raw_text;
132 return QTextCodec::codecForHtml(raw_text, QTextCodec::codecForLocale());
133 }
134
135
136 // This function goes through the entire byte array
137 // and tries to see whether this is a valid UTF-8 sequence.
138 // If it's valid, this is probably a UTF-8 string.
IsValidUtf8(const QByteArray & string)139 bool HTMLEncodingResolver::IsValidUtf8(const QByteArray &string)
140 {
141 // This is an implementation of the Perl code written here:
142 // http://www.w3.org/International/questions/qa-forms-utf-8
143 //
144 // Basically, UTF-8 has a very specific byte-pattern. This function
145 // checks if the sent byte-sequence conforms to this pattern.
146 // If it does, chances are *very* high that this is UTF-8.
147 //
148 // This function is written to be fast, not pretty.
149 if (string.isNull()) {
150 return false;
151 }
152
153 int index = 0;
154
155 while (index < string.size()) {
156 QByteArray dword = string.mid(index, 4);
157
158 if (dword.size() < 4) {
159 dword = dword.leftJustified(4, '\0');
160 }
161
162 const unsigned char *bytes = (const unsigned char *) dword.constData();
163
164 // ASCII
165 if (bytes[0] == 0x09 ||
166 bytes[0] == 0x0A ||
167 bytes[0] == 0x0D ||
168 (0x20 <= bytes[0] && bytes[0] <= 0x7E)
169 ) {
170 index += 1;
171 }
172 // non-overlong 2-byte
173 else if ((0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
174 (0x80 <= bytes[1] && bytes[1] <= 0xBF)
175 ) {
176 index += 2;
177 } else if ((bytes[0] == 0xE0 && // excluding overlongs
178 (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
179 (0x80 <= bytes[2] && bytes[2] <= 0xBF)) ||
180 (((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || // straight 3-byte
181 bytes[0] == 0xEE ||
182 bytes[0] == 0xEF) &&
183 (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
184 (0x80 <= bytes[2] && bytes[2] <= 0xBF)) ||
185 (bytes[0] == 0xED && // excluding surrogates
186 (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
187 (0x80 <= bytes[2] && bytes[2] <= 0xBF))
188 ) {
189 index += 3;
190 } else if ((bytes[0] == 0xF0 && // planes 1-3
191 (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
192 (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
193 (0x80 <= bytes[3] && bytes[3] <= 0xBF)) ||
194 ((0xF1 <= bytes[0] && bytes[0] <= 0xF3) && // planes 4-15
195 (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
196 (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
197 (0x80 <= bytes[3] && bytes[3] <= 0xBF)) ||
198 (bytes[0] == 0xF4 && // plane 16
199 (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
200 (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
201 (0x80 <= bytes[3] && bytes[3] <= 0xBF))
202 ) {
203 index += 4;
204 } else {
205 return false;
206 }
207 }
208
209 return true;
210 }
211