1 /************************************************************************
2 **
3 **  Copyright (C) 2016-2021 Kevin B. Hendricks, Stratford, Ontario, Canada
4 **  Copyright (C) 2012      John Schember <john@nachtimwald.com>
5 **  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
6 **
7 **  This file is part of Sigil.
8 **
9 **  Sigil is free software: you can redistribute it and/or modify
10 **  it under the terms of the GNU General Public License as published by
11 **  the Free Software Foundation, either version 3 of the License, or
12 **  (at your option) any later version.
13 **
14 **  Sigil is distributed in the hope that it will be useful,
15 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 **  GNU General Public License for more details.
18 **
19 **  You should have received a copy of the GNU General Public License
20 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
21 **
22 *************************************************************************/
23 
24 #ifdef _WIN32
25 #define NOMINMAX
26 #endif
27 
28 #include "unzip.h"
29 #ifdef _WIN32
30 #include "iowin32.h"
31 #endif
32 
33 #include <string>
34 
35 #include <QApplication>
36 #include <QtCore/QtCore>
37 #include <QtCore/QDir>
38 #include <QtCore/QFile>
39 #include <QtCore/QFileInfo>
40 #include <QtCore/QFutureSynchronizer>
41 #include <QtConcurrent/QtConcurrent>
42 #include <QtCore/QXmlStreamReader>
43 #include <QDirIterator>
44 #include <QRegularExpression>
45 #include <QRegularExpressionMatch>
46 #include <QStringList>
47 #include <QMessageBox>
48 #include <QUrl>
49 #include <QDebug>
50 
51 #include "BookManipulation/FolderKeeper.h"
52 #include "BookManipulation/CleanSource.h"
53 #include "Importers/ImportEPUB.h"
54 #include "Misc/MediaTypes.h"
55 #include "Misc/FontObfuscation.h"
56 #include "Misc/HTMLEncodingResolver.h"
57 #include "Misc/QCodePage437Codec.h"
58 #include "Misc/SettingsStore.h"
59 #include "Misc/Utility.h"
60 #include "ResourceObjects/CSSResource.h"
61 #include "ResourceObjects/HTMLResource.h"
62 #include "ResourceObjects/OPFResource.h"
63 #include "ResourceObjects/NCXResource.h"
64 #include "ResourceObjects/Resource.h"
65 #include "ResourceObjects/OPFParser.h"
66 #include "sigil_constants.h"
67 #include "sigil_exception.h"
68 
#ifndef MAX_PATH
// Fallback used only when the platform headers have not already defined MAX_PATH.
// Set Max length to 256 because that's the max path size on many systems.
// ExtractContainer() sizes its archive entry name buffer with this value.
#define MAX_PATH 256
#endif
// Size in bytes of the buffer ExtractContainer() uses to copy archive entries to disk.
// This is the same read buffer size used by Java and Perl.
#define BUFF_SIZE 8192

// NOTE(review): presumably the Dublin Core metadata namespace URI; it is not
// referenced in this part of the file.
const QString DUBLIN_CORE_NS             = "http://purl.org/dc/elements/1.1/";
// media-type value that marks a rootfile element in container.xml as an OPF
// (see LocateOPF); also passed to CleanSource::ProcessXML in ReadOPF.
static const QString OEBPS_MIMETYPE      = "application/oebps-package+xml";
// NOTE(review): looks like an error sentinel; its users are outside this part of the file.
static const QString UPDATE_ERROR_STRING = "SG_ERROR";
// NOTE(review): presumably the media type and file extension of the NCX;
// neither is referenced in this part of the file.
const QString NCX_MIMETYPE               = "application/x-dtbncx+xml";
static const QString NCX_EXTENSION       = "ncx";
// Algorithm identifiers that BookContentEncrypted() does not count as
// encrypted book content. ProcessFontFiles() uses the Adobe one to choose
// which identifier value is passed to FontObfuscation::ObfuscateFile.
const QString ADOBE_FONT_ALGO_ID         = "http://ns.adobe.com/pdf/enc#RC";
const QString IDPF_FONT_ALGO_ID          = "http://www.idpf.org/2008/embedding";
// Template for the container.xml that LocateOPF() writes when the epub has
// none; %1 is replaced with the path of the OPF relative to the extraction folder.
static const QString CONTAINER_XML       = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        "<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n"
        "    <rootfiles>\n"
        "        <rootfile full-path=\"%1\" media-type=\"application/oebps-package+xml\"/>\n"
        "   </rootfiles>\n"
        "</container>\n";

// Codec created on first use by ExtractContainer() and never deleted there;
// it decodes archive entry names that are not flagged as UTF-8.
static QCodePage437Codec *cp437 = 0;
91 
92 // Constructor;
93 // The parameter is the file to be imported
ImportEPUB(const QString & fullfilepath)94 ImportEPUB::ImportEPUB(const QString &fullfilepath)
95     : Importer(fullfilepath),
96       m_ExtractedFolderPath(m_TempFolder.GetPath()),
97       m_HasSpineItems(false),
98       m_NCXNotInManifest(false),
99       m_NavResource(NULL)
100 {
101 }
102 
103 // Reads and parses the file
104 // and returns the created Book
GetBook(bool extract_metadata)105 QSharedPointer<Book> ImportEPUB::GetBook(bool extract_metadata)
106 {
107     QList<HTMLResource *> non_well_formed;
108     SettingsStore ss;
109 
110     if (!Utility::IsFileReadable(m_FullFilePath)) {
111         throw (EPUBLoadParseError(QString(QObject::tr("Cannot read EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
112     }
113 
114     // These read the EPUB file
115     ExtractContainer();
116     QHash<QString, QString> encrypted_files = ParseEncryptionXml();
117 
118     if (BookContentEncrypted(encrypted_files)) {
119         throw (FileEncryptedWithDrm(""));
120     }
121 
122     QApplication::setOverrideCursor(Qt::WaitCursor);
123 
124     LocateOPF();
125     m_opfDir = QFileInfo(m_OPFFilePath).dir();
126     // These mutate the m_Book object
127     ReadOPF();
128     AddObfuscatedButUndeclaredFonts(encrypted_files);
129     AddNonStandardAppleXML();
130 
131     m_Book->GetFolderKeeper()->SetGroupFolders(m_ManifestFilePaths, m_ManifestMediaTypes);
132 
133     LoadInfrastructureFiles();
134 
135     // Check for files missing in the Manifest and create warning
136     QStringList notInManifest;
137     foreach(QString file_path, m_ZipFilePaths) {
138         // skip mimetype and anything in META-INF and the opf itself
139         if (file_path == "mimetype") continue;
140         if (file_path.startsWith("META-INF")) continue;
141         if (m_OPFFilePath.contains(file_path)) continue;
142         if (!m_ManifestFilePaths.contains(file_path)) {
143             notInManifest << file_path;
144         }
145     }
146 
147     if (!notInManifest.isEmpty()) {
148         QApplication::restoreOverrideCursor();
149         Utility::DisplayStdWarningDialog(tr("Files exist in epub that are not listed in the manifest, they will be ignored"), notInManifest.join("\n"));
150         QApplication::setOverrideCursor(Qt::WaitCursor);
151     }
152 
153     LoadFolderStructure();
154 
155     const QList<Resource *> resources = m_Book->GetFolderKeeper()->GetResourceList();
156 
157     // We're going to check all html files until we find one that isn't well formed then we'll prompt
158     // the user if they want to auto fix or not.
159     //
160     // If we have non-well formed content and they shouldn't be auto fixed we'll pass that on to
161     // the universal update function so it knows to skip them. Otherwise we won't include them and
162     // let it modify the file.
163     for (int i=0; i<resources.count(); ++i) {
164         if (resources.at(i)->Type() == Resource::HTMLResourceType) {
165             HTMLResource *hresource = qobject_cast<HTMLResource *>(resources.at(i));
166             if (!hresource) {
167                 continue;
168             }
169             // Load the content into the HTMLResource so we can perform a well formed check.
170             try {
171                 hresource->SetText(HTMLEncodingResolver::ReadHTMLFile(hresource->GetFullPath()));
172             } catch (...) {
173                 if (ss.cleanOn() & CLEANON_OPEN) {
174                     non_well_formed << hresource;
175                     continue;
176                 }
177             }
178             if (ss.cleanOn() & CLEANON_OPEN) {
179                 if (!XhtmlDoc::IsDataWellFormed(hresource->GetText(),hresource->GetEpubVersion())) {
180                     non_well_formed << hresource;
181                 } else {
182                     QString txt = hresource->GetText();
183                     // had cases of large files with no line breaks
184                     if (txt.size() > 307200) {
185                         int lines = 0;
186                         QChar *uc = txt.data();
187                         QChar *e = uc + txt.size();
188                         for (; uc != e; ++uc) {
189                             if (uc->unicode() == 0x000A) lines++;
190                         }
191                         if (lines < 5) non_well_formed << hresource;
192                     }
193                 }
194             }
195         }
196     }
197     if (!non_well_formed.isEmpty()) {
198         QApplication::restoreOverrideCursor();
199         if (QMessageBox::Yes == QMessageBox::warning(QApplication::activeWindow(),
200                 tr("Sigil"),
201                 tr("This EPUB has HTML files that are not well formed or are "
202                    "missing a DOCTYPE, html, head or body elements. "
203                    "Sigil can automatically fix these files, although "
204                    "this may result in minor data loss in extreme circumstances.\n\n"
205                    "Do you want to automatically fix the files?"),
206                 QMessageBox::Yes|QMessageBox::No))
207         {
208             foreach(HTMLResource* htmlres, non_well_formed) {
209                 QString fixed_text = CleanSource::Mend(htmlres->GetText(),htmlres->GetEpubVersion());
210                 htmlres->SetText(fixed_text);
211             }
212             non_well_formed.clear();
213         }
214         QApplication::setOverrideCursor(Qt::WaitCursor);
215     }
216 
217     ProcessFontFiles(resources, encrypted_files);
218 
219     if (m_PackageVersion.startsWith('3')) {
220         HTMLResource * nav_resource = NULL;
221         if (m_NavResource) {
222             if (m_NavResource->Type() == Resource::HTMLResourceType) {
223                 nav_resource = qobject_cast<HTMLResource*>(m_NavResource);
224             }
225         }
226         if (!nav_resource) {
227             // we need to create a nav file here because one was not found
228             // it will automatically be added to the content.opf
229             nav_resource = m_Book->CreateEmptyNavFile(true);
230             Resource * res = qobject_cast<Resource *>(nav_resource);
231             m_Book->GetOPF()->SetItemRefLinear(res, false);
232         }
233         m_Book->GetOPF()->SetNavResource(nav_resource);
234     }
235 
236 
237     if (m_NCXNotInManifest && m_PackageVersion.startsWith('2')) {
238         // We manually created an NCX file because there wasn't one in the manifest.
239         // Need to create a new manifest id for it.
240         m_NCXId = m_Book->GetOPF()->AddNCXItem(m_NCXFilePath);
241     }
242 
243     NCXResource * ncxresource = m_Book->GetNCX();
244 
245     if (ncxresource) {
246         // Ensure that our spine has a <spine toc="ncx"> element on it now in case it was missing.
247         m_Book->GetOPF()->UpdateNCXOnSpine(m_NCXId);
248         // Make sure the <item> for the NCX in the manifest reflects correct href path
249         m_Book->GetOPF()->UpdateNCXLocationInManifest(ncxresource);
250     }
251 
252     // If spine was not present or did not contain any items, recreate the OPF from scratch
253     // preserving any important metadata elements and making a new reading order.
254     if (!m_HasSpineItems) {
255         QList<MetaEntry> originalMetadata = m_Book->GetOPF()->GetDCMetadata();
256         m_Book->GetOPF()->AutoFixWellFormedErrors();
257         if (extract_metadata) {
258             m_Book->GetOPF()->SetDCMetadata(originalMetadata);
259         }
260         AddLoadWarning(QObject::tr("The OPF file does not contain a valid spine.") % "\n" %
261                        QObject::tr("Sigil has created a new one for you."));
262     }
263 
264     // update the ShortPathNames to reflect any name duplication
265     m_Book->GetFolderKeeper()->updateShortPathNames();
266 
267     // since we no longer run universal updates we should run
268     // InitialLoad on all TextResources to make sure everything gets loaded
269     m_Book->GetFolderKeeper()->PerformInitialLoads();
270 
271     // If we have modified the book to add spine attribute, manifest item or NCX mark as changed.
272     m_Book->SetModified(GetLoadWarnings().count() > 0);
273     QApplication::restoreOverrideCursor();
274     return m_Book;
275 }
276 
277 
ParseEncryptionXml()278 QHash<QString, QString> ImportEPUB::ParseEncryptionXml()
279 {
280     QString encrpytion_xml_path = m_ExtractedFolderPath + "/META-INF/encryption.xml";
281 
282     if (!QFileInfo(encrpytion_xml_path).exists()) {
283         return QHash<QString, QString>();
284     }
285 
286     QXmlStreamReader encryption(Utility::ReadUnicodeTextFile(encrpytion_xml_path));
287     QHash<QString, QString> encrypted_files;
288     QString encryption_algo;
289     QString uri;
290 
291     while (!encryption.atEnd()) {
292         encryption.readNext();
293 
294         if (encryption.isStartElement()) {
295             if (encryption.name() == "EncryptionMethod") {
296                 encryption_algo = encryption.attributes().value("", "Algorithm").toString();
297             } else if (encryption.name() == "CipherReference") {
298                 // Note: fragments are not part of the CipherReference specs so this is okay
299                 uri = Utility::URLDecodePath(encryption.attributes().value("", "URI").toString());
300                 // hack to handle non-spec encryption file url relative to META-INF instead
301                 // of being absolute from epub root as the spec calls for
302                 if (uri.startsWith("../")) uri = uri.mid(3,-1);
303                 encrypted_files[ uri ] = encryption_algo;
304             }
305         }
306     }
307 
308     if (encryption.hasError()) {
309         const QString error = QString(QObject::tr("Error parsing encryption xml.\nLine: %1 Column %2 - %3"))
310                               .arg(encryption.lineNumber())
311                               .arg(encryption.columnNumber())
312                               .arg(encryption.errorString());
313         throw (EPUBLoadParseError(error.toStdString()));
314     }
315 
316     return encrypted_files;
317 }
318 
319 
BookContentEncrypted(const QHash<QString,QString> & encrypted_files)320 bool ImportEPUB::BookContentEncrypted(const QHash<QString, QString> &encrypted_files)
321 {
322     foreach(QString algorithm, encrypted_files.values()) {
323         if (algorithm != ADOBE_FONT_ALGO_ID &&
324             algorithm != IDPF_FONT_ALGO_ID) {
325             return true;
326         }
327     }
328     return false;
329 }
330 
331 
332 // This is basically a workaround for old versions of InDesign not listing the fonts it
333 // embedded in the OPF manifest, even though the specs say it has to.
334 // It does list them in the encryption.xml, so we use that.
AddObfuscatedButUndeclaredFonts(const QHash<QString,QString> & encrypted_files)335 void ImportEPUB::AddObfuscatedButUndeclaredFonts(const QHash<QString, QString> &encrypted_files)
336 {
337     if (encrypted_files.empty()) {
338         return;
339     }
340 
341     QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
342     foreach(QString filepath, encrypted_files.keys()) {
343         if (!FONT_EXTENSIONS.contains(QFileInfo(filepath).suffix().toLower())) {
344             continue;
345         }
346 
347         // Only add the path to the manifest if it is not already included.
348         QMapIterator<QString, QString> valueSearch(m_Files);
349 
350         if (!valueSearch.findNext(opf_dir.relativeFilePath(filepath))) {
351             m_Files[ Utility::CreateUUID() ] = opf_dir.relativeFilePath(filepath);
352         }
353     }
354 }
355 
356 
357 // Another workaround for non-standard Apple files
358 // At present it only handles com.apple.ibooks.display-options.xml, but any
359 // further iBooks aberrations should be handled here as well.
AddNonStandardAppleXML()360 void ImportEPUB::AddNonStandardAppleXML()
361 {
362     QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
363     QStringList aberrant_Apple_filenames;
364     aberrant_Apple_filenames.append(m_ExtractedFolderPath + "/META-INF/com.apple.ibooks.display-options.xml");
365 
366     for (int i = 0; i < aberrant_Apple_filenames.size(); ++i) {
367         if (QFile::exists(aberrant_Apple_filenames.at(i))) {
368             m_Files[ Utility::CreateUUID() ]  = opf_dir.relativeFilePath(aberrant_Apple_filenames.at(i));
369         }
370     }
371 }
372 
373 
374 // Each resource can provide us with its new path. encrypted_files provides
375 // a mapping from the resource paths to the obfuscation algorithms.
ProcessFontFiles(const QList<Resource * > & resources,const QHash<QString,QString> & encrypted_files)376 void ImportEPUB::ProcessFontFiles(const QList<Resource *> &resources,
377                                   const QHash<QString, QString> &encrypted_files)
378 {
379     if (encrypted_files.empty()) {
380         return;
381     }
382 
383     QList<FontResource *> font_resources = m_Book->GetFolderKeeper()->GetResourceTypeList<FontResource>();
384 
385     if (font_resources.empty()) {
386         return;
387     }
388 
389     foreach(FontResource * font_resource, font_resources) {
390         QString match_path = font_resource->GetRelativePath();
391         QString algorithm  = encrypted_files.value(match_path);
392 
393         if (algorithm.isEmpty()) {
394             continue;
395         }
396 
397         font_resource->SetObfuscationAlgorithm(algorithm);
398 
399         // Actually we are de-obfuscating, but the inverse operations of the obfuscation methods
400         // are the obfuscation methods themselves. For the math oriented, the obfuscation methods
401         // are involutary [ f( f( x ) ) = x ].
402         if (algorithm == ADOBE_FONT_ALGO_ID) {
403             FontObfuscation::ObfuscateFile(font_resource->GetFullPath(), algorithm, m_UuidIdentifierValue);
404         } else {
405             FontObfuscation::ObfuscateFile(font_resource->GetFullPath(), algorithm, m_UniqueIdentifierValue);
406         }
407     }
408 }
409 
ExtractContainer()410 void ImportEPUB::ExtractContainer()
411 {
412     int res = 0;
413     if (!cp437) {
414         cp437 = new QCodePage437Codec();
415     }
416 #ifdef Q_OS_WIN32
417     zlib_filefunc64_def ffunc;
418     fill_win32_filefunc64W(&ffunc);
419     unzFile zfile = unzOpen2_64(Utility::QStringToStdWString(QDir::toNativeSeparators(m_FullFilePath)).c_str(), &ffunc);
420 #else
421     unzFile zfile = unzOpen64(QDir::toNativeSeparators(m_FullFilePath).toUtf8().constData());
422 #endif
423 
424     if (zfile == NULL) {
425         throw (EPUBLoadParseError(QString(QObject::tr("Cannot unzip EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
426     }
427 
428     res = unzGoToFirstFile(zfile);
429 
430     if (res == UNZ_OK) {
431         do {
432             // Get the name of the file in the archive.
433             char file_name[MAX_PATH] = {0};
434             unz_file_info64 file_info;
435             unzGetCurrentFileInfo64(zfile, &file_info, file_name, MAX_PATH, NULL, 0, NULL, 0);
436             QString qfile_name;
437             QString cp437_file_name;
438             qfile_name = QString::fromUtf8(file_name);
439             if (!(file_info.flag & (1<<11))) {
440                 // General purpose bit 11 says the filename is utf-8 encoded. If not set then
441                 // IBM 437 encoding might be used.
442                 cp437_file_name = cp437->toUnicode(file_name);
443             }
444 
445             // If there is no file name then we can't do anything with it.
446             if (!qfile_name.isEmpty()) {
447 
448                 // for security reasons against maliciously crafted zip archives
449                 // we need the file path to always be inside the target folder
450                 // and not outside, so we will remove all illegal backslashes
451                 // and all relative upward paths segments "/../" from the zip's local
452                 // file name/path before prepending the target folder to create
453                 // the final path
454 
455                 QString original_path = qfile_name;
456                 bool evil_or_corrupt_epub = false;
457 
458                 if (qfile_name.contains("\\")) evil_or_corrupt_epub = true;
459                 qfile_name = "/" + qfile_name.replace("\\","");
460 
461                 if (qfile_name.contains("/../")) evil_or_corrupt_epub = true;
462                 qfile_name = qfile_name.replace("/../","/");
463 
464                 while(qfile_name.startsWith("/")) {
465                     qfile_name = qfile_name.remove(0,1);
466                 }
467 
468                 if (cp437_file_name.contains("\\")) evil_or_corrupt_epub = true;
469                 cp437_file_name = "/" + cp437_file_name.replace("\\","");
470 
471                 if (cp437_file_name.contains("/../")) evil_or_corrupt_epub = true;
472                 cp437_file_name = cp437_file_name.replace("/../","/");
473 
474                 while(cp437_file_name.startsWith("/")) {
475                     cp437_file_name = cp437_file_name.remove(0,1);
476                 }
477 
478                 if (evil_or_corrupt_epub) {
479                     unzCloseCurrentFile(zfile);
480                     unzClose(zfile);
481                     throw (EPUBLoadParseError(QString(QObject::tr("Possible evil or corrupt epub file name: %1")).arg(original_path).toStdString()));
482                 }
483 
484                 // We use the dir object to create the path in the temporary directory.
485                 // Unfortunately, we need a dir ojbect to do this as it's not a static function.
486                 QDir dir(m_ExtractedFolderPath);
487                 // Full file path in the temporary directory.
488                 QString file_path = m_ExtractedFolderPath + "/" + qfile_name;
489                 QFileInfo qfile_info(file_path);
490 
491                 // Is this entry a directory?
492                 if (file_info.uncompressed_size == 0 && qfile_name.endsWith('/')) {
493                     dir.mkpath(qfile_name);
494                     continue;
495                 } else {
496                     if (!qfile_info.path().isEmpty()) dir.mkpath(qfile_info.path());
497                     // add it to the list of files found inside the zip
498                     if (cp437_file_name.isEmpty()) {
499                         m_ZipFilePaths << qfile_name;
500                     } else {
501                         m_ZipFilePaths << cp437_file_name;
502                     }
503                 }
504 
505                 // Open the file entry in the archive for reading.
506                 if (unzOpenCurrentFile(zfile) != UNZ_OK) {
507                     unzClose(zfile);
508                     throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
509                 }
510 
511                 // Open the file on disk to write the entry in the archive to.
512                 QFile entry(file_path);
513 
514                 if (!entry.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
515                     unzCloseCurrentFile(zfile);
516                     unzClose(zfile);
517                     throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
518                 }
519 
520                 // Buffered reading and writing.
521                 char buff[BUFF_SIZE] = {0};
522                 int read = 0;
523 
524                 while ((read = unzReadCurrentFile(zfile, buff, BUFF_SIZE)) > 0) {
525                     entry.write(buff, read);
526                 }
527 
528                 entry.close();
529 
530                 // Read errors are marked by a negative read amount.
531                 if (read < 0) {
532                     unzCloseCurrentFile(zfile);
533                     unzClose(zfile);
534                     throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
535                 }
536 
537                 // The file was read but the CRC did not match.
538                 // We don't check the read file size vs the uncompressed file size
539                 // because if they're different there should be a CRC error.
540                 if (unzCloseCurrentFile(zfile) == UNZ_CRCERROR) {
541                     unzClose(zfile);
542                     throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
543                 }
544                 if (!cp437_file_name.isEmpty() && cp437_file_name != qfile_name) {
545                     QString cp437_file_path = m_ExtractedFolderPath + "/" + cp437_file_name;
546                     QFile::copy(file_path, cp437_file_path);
547                 }
548             }
549         } while ((res = unzGoToNextFile(zfile)) == UNZ_OK);
550     }
551 
552     if (res != UNZ_END_OF_LIST_OF_FILE) {
553         unzClose(zfile);
554         throw (EPUBLoadParseError(QString(QObject::tr("Cannot open EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
555     }
556 
557     unzClose(zfile);
558 }
559 
LocateOPF()560 void ImportEPUB::LocateOPF()
561 {
562     QString fullpath = m_ExtractedFolderPath + "/META-INF/container.xml";
563     QXmlStreamReader container;
564     try {
565         container.addData(Utility::ReadUnicodeTextFile(fullpath));
566     } catch (CannotOpenFile&) {
567         // Find the first OPF file.
568         QString OPFfile;
569         QDirIterator files(m_ExtractedFolderPath, QStringList() << "*.opf", QDir::NoFilter, QDirIterator::Subdirectories);
570         while (files.hasNext()) {
571             OPFfile = QDir(m_ExtractedFolderPath).relativeFilePath(files.next());
572             break;
573         }
574 
575         if (OPFfile.isEmpty()) {
576             std::string msg = fullpath.toStdString() + ": " + tr("Epub has missing or improperly specified OPF.").toStdString();
577             throw (CannotOpenFile(msg));
578         }
579 
580         // Create a default container.xml.
581         QDir folder(m_ExtractedFolderPath);
582         folder.mkdir("META-INF");
583         Utility::WriteUnicodeTextFile(CONTAINER_XML.arg(OPFfile), fullpath);
584         container.addData(Utility::ReadUnicodeTextFile(fullpath));
585     }
586 
587     int num_opf = 0;
588 
589     while (!container.atEnd()) {
590         container.readNext();
591 
592         if (container.isStartElement() && container.name() == "rootfile") {
593             if (container.attributes().hasAttribute("media-type") &&
594                 container.attributes().value("", "media-type") == OEBPS_MIMETYPE) {
595                 // As per OCF spec, the first rootfile element
596                 // with the OEBPS mimetype is considered the "main" one.
597                 if (m_OPFFilePath.isEmpty()) {
598                     m_OPFFilePath = m_ExtractedFolderPath + "/" + container.attributes().value("", "full-path").toString();
599                 }
600                 num_opf++;
601 
602             }
603         }
604     }
605 
606     if (container.hasError()) {
607         const QString error = QString(
608                                   QObject::tr("Unable to parse container.xml file.\nLine: %1 Column %2 - %3"))
609                               .arg(container.lineNumber())
610                               .arg(container.columnNumber())
611                               .arg(container.errorString());
612         throw (EPUBLoadParseError(error.toStdString()));
613     }
614 
615     if (num_opf > 1) {
616         Utility::DisplayStdWarningDialog(tr("This epub has multiple renditions (multiple OPF files). Editing this epub in Sigil will produce a normal single rendition epub using only the main (first) OPF file found."),"");
617     }
618 
619     if (m_OPFFilePath.isEmpty() || !QFile::exists(m_OPFFilePath)) {
620         throw (EPUBLoadParseError(QString(QObject::tr("No appropriate OPF file found")).toStdString()));
621     }
622 }
623 
624 
ReadOPF()625 void ImportEPUB::ReadOPF()
626 {
627     QString opf_text = CleanSource::ProcessXML(PrepareOPFForReading(Utility::ReadUnicodeTextFile(m_OPFFilePath)),OEBPS_MIMETYPE);
628     QXmlStreamReader opf_reader(opf_text);
629     QString ncx_id_on_spine;
630 
631     while (!opf_reader.atEnd()) {
632         opf_reader.readNext();
633 
634         if (!opf_reader.isStartElement()) {
635             continue;
636         }
637 
638         if (opf_reader.name() == "package") {
639             m_UniqueIdentifierId = opf_reader.attributes().value("", "unique-identifier").toString();
640             m_PackageVersion = opf_reader.attributes().value("", "version").toString();
641             if (m_PackageVersion == "1.0") m_PackageVersion = "2.0";
642         }
643 
644         else if (opf_reader.name() == "identifier") {
645             ReadIdentifierElement(&opf_reader);
646         }
647 
648         // epub3 look for linked metadata resources that are included inside the epub
649         // but that are not and must not be included in the manifest
650         else if (opf_reader.name() == "link") {
651             ReadMetadataLinkElement(&opf_reader);
652         }
653 
654         // Get the list of content files that
655         // make up the publication
656         else if (opf_reader.name() == "item") {
657             ReadManifestItemElement(&opf_reader);
658         }
659 
660         // We read this just to get the NCX id
661         else if (opf_reader.name() == "spine") {
662             ncx_id_on_spine = opf_reader.attributes().value("", "toc").toString();
663         }
664 
665         else if (opf_reader.name() == "itemref") {
666             m_HasSpineItems = true;
667         }
668     }
669 
670     if (opf_reader.hasError()) {
671         const QString error = QString(QObject::tr("Unable to read OPF file.\nLine: %1 Column %2 - %3"))
672                               .arg(opf_reader.lineNumber())
673                               .arg(opf_reader.columnNumber())
674                               .arg(opf_reader.errorString());
675         throw (EPUBLoadParseError(error.toStdString()));
676     }
677 
678 
679     //Important!  The OPF Resource in the new book must be created now before adding to it in any way
680     QString bookpath;
681     bookpath = m_OPFFilePath.right(m_OPFFilePath.length() - m_ExtractedFolderPath.length() - 1);
682     m_Book->GetFolderKeeper()->AddOPFToFolder(m_PackageVersion, bookpath);
683 
684     // Ensure we have an NCX available
685     LocateOrCreateNCX(ncx_id_on_spine);
686 
687 }
688 
689 
ReadIdentifierElement(QXmlStreamReader * opf_reader)690 void ImportEPUB::ReadIdentifierElement(QXmlStreamReader *opf_reader)
691 {
692     QString id     = opf_reader->attributes().value("", "id").toString();
693     QString scheme = opf_reader->attributes().value("", "scheme").toString();
694     QString value  = opf_reader->readElementText();
695 
696     if (id == m_UniqueIdentifierId) {
697         m_UniqueIdentifierValue = value;
698     }
699 
700     if (m_UuidIdentifierValue.isEmpty() &&
701         (value.contains("urn:uuid:") || scheme.toLower() == "uuid")) {
702         m_UuidIdentifierValue = value;
703     }
704 }
705 
ReadMetadataLinkElement(QXmlStreamReader * opf_reader)706 void ImportEPUB::ReadMetadataLinkElement(QXmlStreamReader *opf_reader)
707 {
708     QString relation = opf_reader->attributes().value("", "rel").toString();
709     QString mtype = opf_reader->attributes().value("", "media-type").toString();
710     QString props = opf_reader->attributes().value("", "properties").toString();
711     QString href = opf_reader->attributes().value("", "href").toString();
712     if (!href.isEmpty()) {
713         QUrl url = QUrl(href);
714         if (url.isRelative()) {
715             // we have a local unmanifested metadata file to handle
716             // attempt to map deprecated record types into proper media-types
717             if (relation == "marc21xml-record") {
718                 mtype = "application/marcxml+xml";
719             }
720             else if (relation == "mods-record") {
721                 mtype = "application/mods+xml";
722             }
723             else if (relation == "onix-record") {
724                 mtype = "application/xml;onix";
725             }
726             else if (relation == "xmp-record") {
727                 mtype = "application/xml;xmp";
728             }
729             else if (relation == "record") {
730                 if (props == "onix") mtype = "application/xml;onix";
731                 if (props == "xmp") mtype = "application/xml;xmp";
732             }
733             QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
734             QString path = opf_dir.absolutePath() + "/" + url.path();
735             if (QFile::exists(path)) {
736                 QString id = Utility::CreateUUID();
737                 m_Files[ id ]  = opf_dir.relativeFilePath(path);
738                 m_FileMimetypes[ id ] = mtype;
739             }
740         }
741     }
742 }
743 
ReadManifestItemElement(QXmlStreamReader * opf_reader)744 void ImportEPUB::ReadManifestItemElement(QXmlStreamReader *opf_reader)
745 {
746     QString id   = opf_reader->attributes().value("", "id").toString();
747     QString href = opf_reader->attributes().value("", "href").toString();
748     QString type = opf_reader->attributes().value("", "media-type").toString();
749     QString properties = opf_reader->attributes().value("", "properties").toString();
750     // FIXME: can epub3 OPF Manifest href attributes include fragments?
751     // FIXME: under epub2 fragments are explicitly outlawed in spec
752     // For robustness sake we will assume they can but ...
753     // Note:  Under epub3 they can point outside the epub so need to handle full url
754 
755     QString apath;
756     if (href.indexOf(':') == -1) {
757         // we know we have a relative href to a file so no fragments can exist
758         apath = Utility::URLDecodePath(href);
759     }
760     // for hrefs pointing outside the epub, apath will be empty
761     // qDebug() << "ImportEpub with Manifest item: " << href << apath;
762     QString extension = QFileInfo(apath).suffix().toLower();
763 
764     // validate the media type if we can, and warn otherwise
765     QString group = MediaTypes::instance()->GetGroupFromMediaType(type,"");
766     QString ext_mtype = MediaTypes::instance()->GetMediaTypeFromExtension(extension, "");
767     if (type.isEmpty() || group.isEmpty()) {
768         const QString load_warning = QObject::tr("The OPF uses an unrecognized media type \"%1\" for file \"%2\"").arg(type).arg(QFileInfo(apath).fileName()) +
769             " - " + QObject::tr("A temporary media type of \"%1\" has been assigned. You should edit your OPF file to fix this problem.").arg(ext_mtype);
770         AddLoadWarning(load_warning);
771     }
772 
773     if (!apath.isEmpty()) {
774 
775         // find the epub root relative file path from the opf location and the item href
776         QString file_path = m_opfDir.absolutePath() + "/" + apath;
777         file_path = Utility::resolveRelativeSegmentsInFilePath(file_path,"/");
778         file_path = file_path.remove(0, m_ExtractedFolderPath.length() + 1);
779 
780         // Manifest Items may *NOT* live in the META-INF and the mimetype file should NOT be manifested
781         if (file_path.startsWith("META-INF/") || (file_path == "mimetype")) {
782             const QString load_warning = QObject::tr("The OPF has an illegal Manifest entry for a file inside the META-INF folder for file \"%1\"").arg(QFileInfo(file_path).fileName()) +
783             " - " + QObject::tr("You should edit your OPF file to remove this entry.");
784             AddLoadWarning(load_warning);
785             return;
786         }
787 
788         if (type != NCX_MIMETYPE && extension != NCX_EXTENSION) {
789             if (!m_ManifestFilePaths.contains(file_path)) {
790                 if (m_Files.contains(id)) {
791                     // We have an error situation with a duplicate id in the epub.
792                     // We must warn the user, but attempt to use another id so the epub can still be loaded.
793                     QString base_id = QFileInfo(apath).fileName();
794                     QString new_id(base_id);
795                     int duplicate_index = 0;
796 
797                     while (m_Files.contains(new_id)) {
798                         duplicate_index++;
799                         new_id = QString("%1%2").arg(base_id).arg(duplicate_index);
800                     }
801 
802                     const QString load_warning = QObject::tr("The OPF manifest contains duplicate ids for: %1").arg(id) +
803                   " - " + QObject::tr("A temporary id has been assigned to load this EPUB. You should edit your OPF file to remove the duplication.");
804                     id = new_id;
805                     AddLoadWarning(load_warning);
806                 }
807 
808                 m_Files[ id ] = apath;
809                 m_FileMimetypes[ id ] = type;
810                 m_ManifestFilePaths << file_path;
811                 m_ManifestMediaTypes << type;
812 
813                 // store information about any nav document
814                 if (properties.contains("nav")) {
815                     m_NavId = id;
816                     m_NavHref = apath;
817                 }
818             }
819         } else {
820             m_NcxCandidates[ id ] = apath;
821             m_ManifestFilePaths << file_path;
822             m_ManifestMediaTypes << type;
823         }
824     }
825 }
826 
827 
LocateOrCreateNCX(const QString & ncx_id_on_spine)828 void ImportEPUB::LocateOrCreateNCX(const QString &ncx_id_on_spine)
829 {
830     QString load_warning;
831     QString ncx_href = "";
832     m_NCXId = ncx_id_on_spine;
833 
834     // handle the normal/proper case of an ncx id on the spine matching an ncx candidate that exists
835     if (!m_NCXId.isEmpty() && m_NcxCandidates.contains(m_NCXId)) {
836         QString bookpath;
837         ncx_href = m_NcxCandidates[ m_NCXId ];
838         m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() % "/" % ncx_href;
839         m_NCXFilePath = Utility::resolveRelativeSegmentsInFilePath(m_NCXFilePath, "/");
840         bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
841         m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
842         m_NCXNotInManifest = false;
843         return;
844     }
845 
846     bool found = false;
847 
848     // now handle ncx not specified in spine but file with ncx extension exists in manifest
849     // Search for the ncx in the manifest by looking for files with
850    // a .ncx extension.
851     if (m_NCXId.isEmpty()) {
852 
853         QHashIterator<QString, QString> ncxSearch(m_NcxCandidates);
854         while (ncxSearch.hasNext()) {
855             ncxSearch.next();
856 
857             if (QFileInfo(ncxSearch.value()).suffix().toLower() == NCX_EXTENSION) {
858                 // we found a file with an ncx extension
859                 m_NCXId = ncxSearch.key();
860                 found = true;
861                 break;
862             }
863         }
864     }
865 
866     if (found) {
867         // m_NCXId has been properly set
868         ncx_href = m_NcxCandidates[ m_NCXId ];
869         m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() % "/" % ncx_href;
870         m_NCXFilePath = Utility::resolveRelativeSegmentsInFilePath(m_NCXFilePath, "/");
871 
872         QString bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
873         m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
874         m_NCXNotInManifest = false;
875         load_warning = QObject::tr("The OPF file did not identify the NCX file correctly.") + "\n" +
876                                " - "  +  QObject::tr("Sigil has used the following file as the NCX:") +
877                                QString(" %1").arg(m_NcxCandidates[ m_NCXId ]);
878 
879         AddLoadWarning(load_warning);
880         return;
881     }
882 
883     // An NCX is only required in epub2 so punt here if epub3
884     if ( m_PackageVersion.startsWith('3') ) return;
885 
886     // epub2 only here
887 
888     // If we reached here there is no file with an ncx file extension in the manifest
889     // There might be a file with an ncx extension inside the epub zip folder but
890     // since it was unmanifested, we will not use it anyway.
891     // So we need to create a new one and thereby handle the following
892     // failure conditions:
893     //     - ncx specified in spine, but no matching manifest item entry
894     //     - ncx file not physically present
895     //     - ncx not in spine or manifest item
896 
897     m_NCXNotInManifest = true;
898 
899     load_warning = QObject::tr("The OPF file does not contain an NCX file.") + "\n" +
900                                " - " +  QObject::tr("Sigil has created a new one for you.");
901 
902     m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() + "/toc.ncx";
903 
904     // Create a new file for the NCX in the *Extracted Folder* Path
905     // We are relying on an identifier being set from the metadata.
906     // It might not have one if the book does not have the urn:uuid: format.
907     NCXResource ncx_resource(m_ExtractedFolderPath, m_NCXFilePath, m_PackageVersion, NULL);
908     ncx_resource.SetEpubVersion(m_PackageVersion);
909     // put it beside the OPF file
910     ncx_resource.FillWithDefaultText(m_PackageVersion, QFileInfo(m_OPFFilePath).absolutePath());
911     if (!m_UuidIdentifierValue.isEmpty()) {
912         ncx_resource.SetMainID(m_UuidIdentifierValue);
913     }
914     ncx_resource.SaveToDisk();
915 
916     // now add the NCX to our folder
917     QString bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
918     m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
919 
920     if (!load_warning.isEmpty()) {
921         AddLoadWarning(load_warning);
922     }
923 }
924 
925 
LoadInfrastructureFiles()926 void ImportEPUB::LoadInfrastructureFiles()
927 {
928     // always SetEpubVersion before SetText in OPF as SetText will validate with it
929     m_Book->GetOPF()->SetEpubVersion(m_PackageVersion);
930     m_Book->GetOPF()->SetText(CleanSource::ProcessXML(PrepareOPFForReading(Utility::ReadUnicodeTextFile(m_OPFFilePath)),OEBPS_MIMETYPE));
931     QString OPFBookRelPath = m_OPFFilePath;
932     OPFBookRelPath = OPFBookRelPath.remove(0,m_ExtractedFolderPath.length()+1);
933     m_Book->GetOPF()->SetCurrentBookRelPath(OPFBookRelPath);
934     NCXResource * ncxresource = m_Book->GetNCX();
935     if (ncxresource) {
936         ncxresource->SetEpubVersion(m_PackageVersion);
937         ncxresource->SetText(CleanSource::ProcessXML(Utility::ReadUnicodeTextFile(m_NCXFilePath),"application/x-dtbncx+xml"));
938         QString NCXBookRelPath = m_NCXFilePath;
939         NCXBookRelPath = NCXBookRelPath.remove(0,m_ExtractedFolderPath.length()+1);
940         ncxresource->SetCurrentBookRelPath(NCXBookRelPath);
941     }
942 }
943 
944 
LoadFolderStructure()945 bool ImportEPUB::LoadFolderStructure()
946 {
947     QList<QString> keys = m_Files.keys();
948     int num_files = keys.count();
949     bool success = true;
950 
951     QFutureSynchronizer<std::tuple<QString, QString>> sync;
952 
953     for (int i = 0; i < num_files; ++i) {
954         QString id = keys.at(i);
955         sync.addFuture(QtConcurrent::run(
956                            this,
957                            &ImportEPUB::LoadOneFile,
958                            m_Files.value(id),
959                            m_FileMimetypes.value(id)));
960     }
961 
962     sync.waitForFinished();
963     QList<QFuture<std::tuple<QString, QString>>> futures = sync.futures();
964     int num_futures = futures.count();
965 
966     for (int i = 0; i < num_futures; ++i) {
967         std::tuple<QString, QString> result = futures.at(i).result();
968         if (std::get<0>(result) != std::get<1>(result)) {
969             qDebug() << "LoadFolderStructure Issue: " << std::get<0>(result) << std::get<1>(result);
970             success = false;
971         }
972     }
973 
974     return success;
975 }
976 
977 
LoadOneFile(const QString & path,const QString & mimetype)978 std::tuple<QString, QString> ImportEPUB::LoadOneFile(const QString &path, const QString &mimetype)
979 {
980     // Use opf relative href to create the book path (currentpath) for this file
981     QString fullfilepath = QDir::cleanPath(QFileInfo(m_OPFFilePath).absolutePath() + "/" + path);
982     QString currentpath = fullfilepath;
983     currentpath = currentpath.remove(0,m_ExtractedFolderPath.length()+1);
984     try {
985         QString bookpath = currentpath;
986         Resource *resource = m_Book->GetFolderKeeper()->AddContentFileToFolder(fullfilepath, false, mimetype, bookpath);
987         if (path == m_NavHref) {
988             m_NavResource = resource;
989         }
990         QString newpath = resource->GetRelativePath();
991         return std::make_tuple(currentpath, newpath);
992     } catch (FileDoesNotExist&) {
993         return std::make_tuple(UPDATE_ERROR_STRING, UPDATE_ERROR_STRING);
994     }
995 }
996 
997 
// Returns a copy of the OPF source in which the text captured by group 1 of
// VERSION_ATTRIBUTE is replaced with "1.0".  The pattern is only looked for
// in the first XML_DECLARATION_SEARCH_PREFIX_SIZE characters; without a
// match the source is returned unchanged.
QString ImportEPUB::PrepareOPFForReading(const QString &source)
{
    const QString head = source.left(XML_DECLARATION_SEARCH_PREFIX_SIZE);
    const QRegularExpression version_rx(VERSION_ATTRIBUTE);
    const QRegularExpressionMatch hit = version_rx.match(head);

    if (!hit.hasMatch()) {
        return source;
    }

    // MASSIVE hack for XML 1.1 "support";
    // this is only for people who specify
    // XML 1.1 when they actually only use XML 1.0.
    // Offsets found in the head are valid for the whole text as well,
    // since the head is its leading part.
    QString patched(source);
    patched.replace(hit.capturedStart(1), hit.capturedLength(1), "1.0");
    return patched;
}
1012