1 /************************************************************************
2 **
3 ** Copyright (C) 2015-2021 Kevin B. Hendricks Stratford, ON, Canada
4 ** Copyright (C) 2009-2011 Strahinja Markovic <strahinja.markovic@gmail.com>
5 **
6 ** This file is part of Sigil.
7 **
8 ** Sigil is free software: you can redistribute it and/or modify
9 ** it under the terms of the GNU General Public License as published by
10 ** the Free Software Foundation, either version 3 of the License, or
11 ** (at your option) any later version.
12 **
13 ** Sigil is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with Sigil. If not, see <http://www.gnu.org/licenses/>.
20 **
21 *************************************************************************/
22
23 #include "EmbedPython/EmbeddedPython.h"
24
25 #include <QtCore/QString>
26 #include <QtCore/QStringList>
27 #include <QtCore/QWriteLocker>
28 #include <QtWidgets/QApplication>
29 #include <QtWidgets/QProgressDialog>
30 #include <QRegularExpression>
31 #include <QRegularExpressionMatch>
32
33 #include "BookManipulation/CleanSource.h"
34 #include "BookManipulation/XhtmlDoc.h"
35 #include "Parsers/GumboInterface.h"
36 #include "Misc/SettingsStore.h"
37 #include "sigil_constants.h"
38 #include "sigil_exception.h"
39 #include "Misc/Utility.h"
40 #include <utility>
41
42 static const QString HEAD_END = "</\\s*head\\s*>";
43 const QString SVG_NAMESPACE_PREFIX = "<\\s*[^>]*(xmlns\\s*:\\s*svg\\s*=\\s*(?:\"|')[^\"']+(?:\"|'))[^>]*>";
44
45 static const QStringList NUMERIC_NBSP = QStringList() << " " << " " << " ";
46
47
48 // Performs general cleaning (and improving)
49 // of provided book XHTML source code
Mend(const QString & source,const QString & version)50 QString CleanSource::Mend(const QString &source, const QString &version)
51 {
52 SettingsStore settings;
53 QString newsource = PreprocessSpecialCases(source);
54 GumboInterface gp = GumboInterface(newsource, version);
55 newsource = gp.repair();
56 newsource = CharToEntity(newsource, version);
57 newsource = PrettifyDOCTYPEHeader(newsource);
58 return newsource;
59 }
60
61
62 // Mend and Prettify XHTML
MendPrettify(const QString & source,const QString & version)63 QString CleanSource::MendPrettify(const QString &source, const QString &version)
64 {
65 QString newsource = PreprocessSpecialCases(source);
66 GumboInterface gi = GumboInterface(newsource, version);
67 newsource = gi.prettyprint();
68 newsource = CharToEntity(newsource, version);
69 newsource = PrettifyDOCTYPEHeader(newsource);
70 return newsource;
71 }
72
73
74 // Repair XML if needed and PrettyPrint using BeautifulSoup4
XMLPrettyPrintBS4(const QString & source,const QString mtype)75 QString CleanSource::XMLPrettyPrintBS4(const QString &source, const QString mtype)
76 {
77 int rv = 0;
78 QString error_traceback;
79 QList<QVariant> args;
80 args.append(QVariant(source));
81 args.append(QVariant(mtype));
82 EmbeddedPython * epython = EmbeddedPython::instance();
83
84 QVariant res = epython->runInPython( QString("xmlprocessor"),
85 QString("repairXML"),
86 args,
87 &rv,
88 error_traceback);
89 if (rv != 0) {
90 Utility::DisplayStdWarningDialog(QString("error in xmlprocessor repairXML: ") + QString::number(rv),
91 error_traceback);
92 // an error happened, return unchanged original
93 return QString(source);
94 }
95 return res.toString();
96 }
97
98 // convert the source to valid XHTML
ToValidXHTML(const QString & source,const QString & version)99 QString CleanSource::ToValidXHTML(const QString &source, const QString &version)
100 {
101 QString newsource = source;
102 if (!XhtmlDoc::IsDataWellFormed(source)) {
103 newsource = Mend(source, version);
104 }
105 return newsource;
106 }
107
WellFormedXMLCheck(const QString & source,const QString mtype)108 XhtmlDoc::WellFormedError CleanSource::WellFormedXMLCheck(const QString &source, const QString mtype)
109 {
110 XhtmlDoc::WellFormedError error;
111 int rv = 0;
112 QString error_traceback;
113 QList<QVariant> args;
114 args.append(QVariant(source));
115 args.append(QVariant(mtype));
116 EmbeddedPython * epython = EmbeddedPython::instance();
117
118 QVariant res = epython->runInPython( QString("xmlprocessor"),
119 QString("WellFormedXMLCheck"),
120 args,
121 &rv,
122 error_traceback);
123 if (rv != 0) {
124 Utility::DisplayStdWarningDialog(QString("error in xmlprocessor WellFormedXMLCheck: ") + QString::number(rv),
125 error_traceback);
126 // an error happened during check, return well-formed as true
127 return error;
128 }
129 QStringList errors = res.toStringList();
130 error.line = errors.at(0).toInt();
131 error.column = errors.at(1).toInt();
132 error.message = errors.at(2);
133 return error;
134 }
135
IsWellFormedXML(const QString & source,const QString mtype)136 bool CleanSource::IsWellFormedXML(const QString &source, const QString mtype)
137 {
138 int rv = 0;
139 QString error_traceback;
140 QList<QVariant> args;
141 args.append(QVariant(source));
142 args.append(QVariant(mtype));
143 EmbeddedPython * epython = EmbeddedPython::instance();
144
145 QVariant res = epython->runInPython( QString("xmlprocessor"),
146 QString("IsWellFormedXML"),
147 args,
148 &rv,
149 error_traceback);
150 if (rv != 0) {
151 Utility::DisplayStdWarningDialog(QString("error in xmlprocessor IsWellFormedXML: ") + QString::number(rv),
152 error_traceback);
153 // an error happened during check return well-formed as true
154 return true;
155 }
156 return res.toBool();
157 }
158
// Processes XML of the given media type; currently this simply
// forwards to XMLPrettyPrintBS4().
QString CleanSource::ProcessXML(const QString &source, const QString mtype)
{
    const QString processed = XMLPrettyPrintBS4(source, mtype);
    return processed;
}
163
// Removes the first <meta ... charset ...> tag found in front of the
// closing head tag. The source is returned unchanged when there is
// no closing head tag or no such meta tag before it.
QString CleanSource::RemoveMetaCharset(const QString &source)
{
    const QRegularExpression head_close(HEAD_END);
    const int head_close_pos = source.indexOf(head_close);
    if (head_close_pos == -1) {
        return source;
    }

    // only the part in front of the closing head tag is searched
    QString head_part = Utility::Substring(0, head_close_pos, source);

    const QRegularExpression charset_meta("<meta[^>]+charset[^>]+>");
    const QRegularExpressionMatch found = charset_meta.match(head_part);
    if (found.hasMatch()) {
        head_part.remove(found.capturedStart(), found.capturedLength());
        const QString remainder = Utility::Substring(head_close_pos, source.length(), source);
        return head_part + remainder;
    }
    return source;
}
182
183
184 // neither svg nor math tags need a namespace prefix defined
185 // especially as epub3 now includes them into the html5 spec
186 // So we need to remove the svg prefix from the tags before
187 // processing them with gumbo
PreprocessSpecialCases(const QString & source)188 QString CleanSource::PreprocessSpecialCases(const QString &source)
189 {
190 QString newsource = source;
191 // remove prefix from root tag and add unprefixed svg namespace to it
192 QRegularExpression root_svg_tag_with_prefix("<\\s*svg\\s*:\\s*svg");
193 QString root_svg_embeddedNS = "<svg xmlns=\"http://www.w3.org/2000/svg\"";
194 newsource.replace(root_svg_tag_with_prefix, root_svg_embeddedNS);
195 // search for any prefixed svg namespace in that root tag and remove it
196 QRegularExpression svg_nsprefix(SVG_NAMESPACE_PREFIX);
197 QRegularExpressionMatch mo = svg_nsprefix.match(newsource);
198 if (mo.hasMatch()) {
199 newsource.replace(mo.capturedStart(1), mo.capturedLength(1), "");
200 }
201 // now strip the prefix from all child starting tags
202 QRegularExpression starting_child_svg_tag_with_prefix("<\\s*svg\\s*:");
203 QString starting_child_tag_no_prefix = "<";
204 newsource.replace(starting_child_svg_tag_with_prefix, starting_child_tag_no_prefix);
205 // do the same for any child ending tags
206 QRegularExpression ending_child_svg_tag_with_prefix("<\\s*/\\s*svg\\s*:");
207 QString ending_child_tag_no_prefix = "</";
208 newsource.replace(ending_child_svg_tag_with_prefix, ending_child_tag_no_prefix);
209 return newsource;
210 }
211
212
213 // Be careful to make sure that we do not mess up epub3 <!DOCTYPE html> here
PrettifyDOCTYPEHeader(const QString & source)214 QString CleanSource::PrettifyDOCTYPEHeader(const QString &source)
215 {
216 QString newsource = source;
217 const int SAFE_LENGTH = 200;
218 QRegularExpression doctype_invalid("<!DOCTYPE html PUBLIC \"W3C");
219 int index = newsource.indexOf(doctype_invalid);
220
221 if (index > 0 && index < SAFE_LENGTH) {
222 newsource.insert(index + 23, "-//");
223 }
224
225 QRegularExpression doctype_missing_newline("\\?><!DOCTYPE");
226 index = source.indexOf(doctype_missing_newline);
227
228 if (index > 0 && index < SAFE_LENGTH) {
229 newsource.insert(index + 2, "\n");
230 QRegularExpression html_missing_newline("\"><html ");
231 index = newsource.indexOf(html_missing_newline);
232
233 if (index > 0 && index < SAFE_LENGTH) {
234 newsource.insert(index + 2, "\n\n");
235 }
236
237 bool is_ncx = false;
238 QRegularExpression ncx_missing_newline("\"><ncx ");
239 index = newsource.indexOf(ncx_missing_newline);
240
241 if (index > 0 && index < SAFE_LENGTH) {
242 is_ncx = true;
243 newsource.insert(index + 2, "\n");
244 }
245
246 QRegularExpression doctype_http_missing_newline("//EN\" \"http://");
247 index = newsource.indexOf(doctype_http_missing_newline);
248
249 if (index > 0 && index < SAFE_LENGTH) {
250 newsource.insert(index + 5, is_ncx ? "\n" : "\n ");
251 }
252 }
253
254 return newsource;
255 }
256
257
// Replaces the characters the user asked to preserve as entities
// (SettingsStore::preserveEntityCodeNames()) with their entity text.
//
// source  - the text to process
// version - epub version string; only versions starting with "2"
//           or "3" cause any replacement
//
// For epub2 every configured character is replaced by its lower-cased
// code name. For epub3 only numeric entities are used: code names
// starting with "&#" are applied as they are, and a configured
// "&nbsp;" is written as "&#160;" unless one of the NUMERIC_NBSP
// forms is configured as well.
QString CleanSource::CharToEntity(const QString &source, const QString &version)
{
    SettingsStore settings;
    const QList<std::pair<ushort, QString>> codenames = settings.preserveEntityCodeNames();

    const bool is_epub2 = version.startsWith("2");
    const bool is_epub3 = !is_epub2 && version.startsWith("3");

    // does the user already preserve nbsp in one of its numeric forms?
    bool has_numeric_nbsp = false;
    for (const auto &entry : codenames) {
        if (NUMERIC_NBSP.contains(entry.second.toLower())) {
            has_numeric_nbsp = true;
            break;
        }
    }

    // now intelligently handle the replacements
    QString new_source = source;
    for (const auto &entry : codenames) {
        const QChar character(entry.first);
        const QString codename = entry.second.toLower();
        if (is_epub2) {
            new_source.replace(character, codename);
        } else if (is_epub3) {
            // only use numeric entities in epub3
            if (codename.startsWith("&#")) {
                new_source.replace(character, codename);
            } else if ((codename == "&nbsp;") && !has_numeric_nbsp) {
                new_source.replace(character, QString("&#160;"));
            }
        }
    }
    return new_source;
}
287
288
ReformatAll(QList<HTMLResource * > resources,QString (clean_func)(const QString & source,const QString & version))289 bool CleanSource::ReformatAll(QList <HTMLResource *> resources, QString(clean_func)(const QString &source, const QString &version))
290 {
291 QProgressDialog progress(QObject::tr("Cleaning..."), 0, 0, resources.count(), Utility::GetMainWindow());
292 progress.setMinimumDuration(PROGRESS_BAR_MINIMUM_DURATION);
293 int progress_value = 0;
294 progress.setValue(progress_value);
295 bool book_modified = false;
296 foreach(HTMLResource * resource, resources) {
297 progress.setValue(progress_value++);
298 qApp->processEvents();
299 QWriteLocker locker(&resource->GetLock());
300 QString source = resource->GetText();
301 QString version = resource->GetEpubVersion();
302 QString newsource = clean_func(source, version);
303 if (newsource != source) {
304 book_modified = true;
305 resource->SetText(newsource);
306 }
307 }
308 return book_modified;
309 }
310