1 /************************************************************************
2 **
3 **  Copyright (C) 2015-2021 Kevin B. Hendricks Stratford, ON, Canada
4 **  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
5 **
6 **  This file is part of Sigil.
7 **
8 **  Sigil is free software: you can redistribute it and/or modify
9 **  it under the terms of the GNU General Public License as published by
10 **  the Free Software Foundation, either version 3 of the License, or
11 **  (at your option) any later version.
12 **
13 **  Sigil is distributed in the hope that it will be useful,
14 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
15 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 **  GNU General Public License for more details.
17 **
18 **  You should have received a copy of the GNU General Public License
19 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
20 **
21 *************************************************************************/
22 
23 #include "EmbedPython/EmbeddedPython.h"
24 
25 #include <QtCore/QString>
26 #include <QtCore/QStringList>
27 #include <QtCore/QWriteLocker>
28 #include <QtWidgets/QApplication>
29 #include <QtWidgets/QProgressDialog>
30 #include <QRegularExpression>
31 #include <QRegularExpressionMatch>
32 
33 #include "BookManipulation/CleanSource.h"
34 #include "BookManipulation/XhtmlDoc.h"
35 #include "Parsers/GumboInterface.h"
36 #include "Misc/SettingsStore.h"
37 #include "sigil_constants.h"
38 #include "sigil_exception.h"
39 #include "Misc/Utility.h"
40 #include <utility>
41 
// Regular expression pattern matching a closing head tag, tolerating
// whitespace inside the tag (e.g. "</head>" or "</ head >").
static const QString HEAD_END = "</\\s*head\\s*>";
// Regular expression pattern matching a tag that carries an xmlns:svg
// declaration quoted with either " or '. Capture group 1 is the
// declaration itself (xmlns:svg="...").
const QString SVG_NAMESPACE_PREFIX = "<\\s*[^>]*(xmlns\\s*:\\s*svg\\s*=\\s*(?:\"|')[^\"']+(?:\"|'))[^>]*>";

// Numeric character references for the no-break space (160 decimal, a0 hex),
// written in lower case because they are compared against lower-cased names.
static const QStringList NUMERIC_NBSP = QStringList() << "&#160;" << "&#xa0;" << "&#x00a0;";
46 
47 
48 // Performs general cleaning (and improving)
49 // of provided book XHTML source code
Mend(const QString & source,const QString & version)50 QString CleanSource::Mend(const QString &source, const QString &version)
51 {
52     SettingsStore settings;
53     QString newsource = PreprocessSpecialCases(source);
54     GumboInterface gp = GumboInterface(newsource, version);
55     newsource = gp.repair();
56     newsource = CharToEntity(newsource, version);
57     newsource = PrettifyDOCTYPEHeader(newsource);
58     return newsource;
59 }
60 
61 
62 // Mend and Prettify XHTML
MendPrettify(const QString & source,const QString & version)63 QString CleanSource::MendPrettify(const QString &source, const QString &version)
64 {
65     QString newsource = PreprocessSpecialCases(source);
66     GumboInterface gi = GumboInterface(newsource, version);
67     newsource = gi.prettyprint();
68     newsource = CharToEntity(newsource, version);
69     newsource = PrettifyDOCTYPEHeader(newsource);
70     return newsource;
71 }
72 
73 
74 // Repair XML if needed and PrettyPrint using BeautifulSoup4
XMLPrettyPrintBS4(const QString & source,const QString mtype)75 QString CleanSource::XMLPrettyPrintBS4(const QString &source, const QString mtype)
76 {
77     int rv = 0;
78     QString error_traceback;
79     QList<QVariant> args;
80     args.append(QVariant(source));
81     args.append(QVariant(mtype));
82     EmbeddedPython * epython  = EmbeddedPython::instance();
83 
84     QVariant res = epython->runInPython( QString("xmlprocessor"),
85                                          QString("repairXML"),
86                                          args,
87                                          &rv,
88                                          error_traceback);
89     if (rv != 0) {
90         Utility::DisplayStdWarningDialog(QString("error in xmlprocessor repairXML: ") + QString::number(rv),
91                                          error_traceback);
92         // an error happened, return unchanged original
93         return QString(source);
94     }
95     return res.toString();
96 }
97 
98 // convert the source to valid XHTML
ToValidXHTML(const QString & source,const QString & version)99 QString CleanSource::ToValidXHTML(const QString &source, const QString &version)
100 {
101     QString newsource = source;
102     if (!XhtmlDoc::IsDataWellFormed(source)) {
103         newsource = Mend(source, version);
104     }
105     return newsource;
106 }
107 
WellFormedXMLCheck(const QString & source,const QString mtype)108 XhtmlDoc::WellFormedError CleanSource::WellFormedXMLCheck(const QString &source, const QString mtype)
109 {
110     XhtmlDoc::WellFormedError error;
111     int rv = 0;
112     QString error_traceback;
113     QList<QVariant> args;
114     args.append(QVariant(source));
115     args.append(QVariant(mtype));
116     EmbeddedPython * epython  = EmbeddedPython::instance();
117 
118     QVariant res = epython->runInPython( QString("xmlprocessor"),
119                                          QString("WellFormedXMLCheck"),
120                                          args,
121                                          &rv,
122                                          error_traceback);
123     if (rv != 0) {
124         Utility::DisplayStdWarningDialog(QString("error in xmlprocessor WellFormedXMLCheck: ") + QString::number(rv),
125                                          error_traceback);
126         // an error happened during check, return well-formed as true
127         return error;
128     }
129     QStringList errors = res.toStringList();
130     error.line = errors.at(0).toInt();
131     error.column = errors.at(1).toInt();
132     error.message = errors.at(2);
133     return error;
134 }
135 
IsWellFormedXML(const QString & source,const QString mtype)136 bool CleanSource::IsWellFormedXML(const QString &source, const QString mtype)
137 {
138     int rv = 0;
139     QString error_traceback;
140     QList<QVariant> args;
141     args.append(QVariant(source));
142     args.append(QVariant(mtype));
143     EmbeddedPython * epython  = EmbeddedPython::instance();
144 
145     QVariant res = epython->runInPython( QString("xmlprocessor"),
146                                          QString("IsWellFormedXML"),
147                                          args,
148                                          &rv,
149                                          error_traceback);
150     if (rv != 0) {
151         Utility::DisplayStdWarningDialog(QString("error in xmlprocessor IsWellFormedXML: ") + QString::number(rv),
152                                          error_traceback);
153         // an error happened during check return well-formed as true
154         return true;
155     }
156     return res.toBool();
157 }
158 
// Forwarding wrapper: processes XML by delegating to XMLPrettyPrintBS4()
// with the same source text and media type.
QString CleanSource::ProcessXML(const QString &source, const QString mtype)
{
    QString processed = XMLPrettyPrintBS4(source, mtype);
    return processed;
}
163 
// Removes the first <meta ... charset ...> tag found ahead of the closing
// head tag.
//
// The source is returned unchanged when no closing head tag (HEAD_END) is
// present, or when the part preceding it contains no such meta tag.
QString CleanSource::RemoveMetaCharset(const QString &source)
{
    const QRegularExpression head_close(HEAD_END);
    const int head_close_pos = source.indexOf(head_close);
    if (head_close_pos == -1) {
        return source;
    }

    // Only the part up to the closing head tag is searched and edited.
    QString head_part = Utility::Substring(0, head_close_pos, source);

    const QRegularExpression charset_meta("<meta[^>]+charset[^>]+>");
    const QRegularExpressionMatch found = charset_meta.match(head_part);
    if (found.hasMatch()) {
        head_part.remove(found.capturedStart(), found.capturedLength());
        // Re-attach everything from the closing head tag onwards.
        return head_part + Utility::Substring(head_close_pos, source.length(), source);
    }
    return source;
}
182 
183 
184 // neither svg nor math tags need a namespace prefix defined
185 // especially as epub3 now includes them into the html5 spec
186 // So we need to remove the svg prefix from the tags before
187 // processing them with gumbo
PreprocessSpecialCases(const QString & source)188 QString CleanSource::PreprocessSpecialCases(const QString &source)
189 {
190     QString newsource = source;
191     // remove prefix from root tag and add unprefixed svg namespace to it
192     QRegularExpression root_svg_tag_with_prefix("<\\s*svg\\s*:\\s*svg");
193     QString root_svg_embeddedNS = "<svg xmlns=\"http://www.w3.org/2000/svg\"";
194     newsource.replace(root_svg_tag_with_prefix, root_svg_embeddedNS);
195     // search for any prefixed svg namespace in that root tag and remove it
196     QRegularExpression svg_nsprefix(SVG_NAMESPACE_PREFIX);
197     QRegularExpressionMatch mo = svg_nsprefix.match(newsource);
198     if (mo.hasMatch()) {
199         newsource.replace(mo.capturedStart(1), mo.capturedLength(1), "");
200     }
201     // now strip the prefix from all child starting tags
202     QRegularExpression starting_child_svg_tag_with_prefix("<\\s*svg\\s*:");
203     QString starting_child_tag_no_prefix = "<";
204     newsource.replace(starting_child_svg_tag_with_prefix, starting_child_tag_no_prefix);
205     // do the same for any child ending tags
206     QRegularExpression ending_child_svg_tag_with_prefix("<\\s*/\\s*svg\\s*:");
207     QString ending_child_tag_no_prefix = "</";
208     newsource.replace(ending_child_svg_tag_with_prefix, ending_child_tag_no_prefix);
209     return newsource;
210 }
211 
212 
213 // Be careful to make sure that we do not mess up epub3 <!DOCTYPE html> here
PrettifyDOCTYPEHeader(const QString & source)214 QString CleanSource::PrettifyDOCTYPEHeader(const QString &source)
215 {
216     QString newsource = source;
217     const int SAFE_LENGTH = 200;
218     QRegularExpression doctype_invalid("<!DOCTYPE html PUBLIC \"W3C");
219     int index = newsource.indexOf(doctype_invalid);
220 
221     if (index > 0 && index < SAFE_LENGTH) {
222         newsource.insert(index + 23, "-//");
223     }
224 
225     QRegularExpression doctype_missing_newline("\\?><!DOCTYPE");
226     index = source.indexOf(doctype_missing_newline);
227 
228     if (index > 0 && index < SAFE_LENGTH) {
229         newsource.insert(index + 2, "\n");
230         QRegularExpression html_missing_newline("\"><html ");
231         index = newsource.indexOf(html_missing_newline);
232 
233         if (index > 0 && index < SAFE_LENGTH) {
234             newsource.insert(index + 2, "\n\n");
235         }
236 
237         bool is_ncx = false;
238         QRegularExpression ncx_missing_newline("\"><ncx ");
239         index = newsource.indexOf(ncx_missing_newline);
240 
241         if (index > 0 && index < SAFE_LENGTH) {
242             is_ncx = true;
243             newsource.insert(index + 2, "\n");
244         }
245 
246         QRegularExpression doctype_http_missing_newline("//EN\" \"http://");
247         index = newsource.indexOf(doctype_http_missing_newline);
248 
249         if (index > 0 && index < SAFE_LENGTH) {
250             newsource.insert(index + 5, is_ncx ? "\n" : "\n ");
251         }
252     }
253 
254     return newsource;
255 }
256 
257 
// Replaces characters with entities, driven by the (character code, entity
// name) pairs returned by SettingsStore::preserveEntityCodeNames().
//
// Entity names are lower-cased before use.
//   - version starting with "2": every listed character is replaced by its
//     entity name.
//   - version starting with "3": only entries whose name is a numeric
//     reference ("&#...") are applied; an "&nbsp;" entry is written as
//     "&#160;" unless the list already holds a numeric no-break space entry.
//   - any other version: the text is returned unchanged.
//
// NOTE(review): named entities are case sensitive, so lower-casing a name
// such as "&Eacute;" would yield a different entity - confirm that the
// stored names are always lower case.
QString CleanSource::CharToEntity(const QString &source, const QString &version)
{
    SettingsStore settings;
    const QList<std::pair<ushort, QString>> entities = settings.preserveEntityCodeNames();

    // Does the list already contain a numeric form of the no-break space?
    bool numeric_nbsp_listed = false;
    for (const std::pair<ushort, QString> &entry : entities) {
        if (NUMERIC_NBSP.contains(entry.second.toLower())) {
            numeric_nbsp_listed = true;
            break;
        }
    }

    const bool is_epub2 = version.startsWith("2");
    const bool is_epub3 = version.startsWith("3");

    QString result = source;
    for (const std::pair<ushort, QString> &entry : entities) {
        const QString entity = entry.second.toLower();

        if (is_epub2) {
            result.replace(QChar(entry.first), entity);
            continue;
        }
        if (!is_epub3) {
            continue;
        }

        // only use numeric entities in epub3
        if (entity.startsWith("&#")) {
            result.replace(QChar(entry.first), entity);
        } else if (!numeric_nbsp_listed && (entity == "&nbsp;")) {
            result.replace(QChar(entry.first), "&#160;");
        }
    }
    return result;
}
287 
288 
ReformatAll(QList<HTMLResource * > resources,QString (clean_func)(const QString & source,const QString & version))289 bool CleanSource::ReformatAll(QList <HTMLResource *> resources, QString(clean_func)(const QString &source, const QString &version))
290 {
291     QProgressDialog progress(QObject::tr("Cleaning..."), 0, 0, resources.count(), Utility::GetMainWindow());
292     progress.setMinimumDuration(PROGRESS_BAR_MINIMUM_DURATION);
293     int progress_value = 0;
294     progress.setValue(progress_value);
295     bool book_modified = false;
296     foreach(HTMLResource * resource, resources) {
297         progress.setValue(progress_value++);
298         qApp->processEvents();
299         QWriteLocker locker(&resource->GetLock());
300         QString source = resource->GetText();
301         QString version = resource->GetEpubVersion();
302         QString newsource = clean_func(source, version);
303         if (newsource != source) {
304             book_modified = true;
305             resource->SetText(newsource);
306         }
307     }
308     return book_modified;
309 }
310