1 /************************************************************************
2 **
3 ** Copyright (C) 2016-2021 Kevin B. Hendricks, Stratford, Ontario, Canada
4 ** Copyright (C) 2012 John Schember <john@nachtimwald.com>
5 ** Copyright (C) 2009-2011 Strahinja Markovic <strahinja.markovic@gmail.com>
6 **
7 ** This file is part of Sigil.
8 **
9 ** Sigil is free software: you can redistribute it and/or modify
10 ** it under the terms of the GNU General Public License as published by
11 ** the Free Software Foundation, either version 3 of the License, or
12 ** (at your option) any later version.
13 **
14 ** Sigil is distributed in the hope that it will be useful,
15 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ** GNU General Public License for more details.
18 **
19 ** You should have received a copy of the GNU General Public License
20 ** along with Sigil. If not, see <http://www.gnu.org/licenses/>.
21 **
22 *************************************************************************/
23
24 #ifdef _WIN32
25 #define NOMINMAX
26 #endif
27
28 #include "unzip.h"
29 #ifdef _WIN32
30 #include "iowin32.h"
31 #endif
32
33 #include <string>
34
35 #include <QApplication>
36 #include <QtCore/QtCore>
37 #include <QtCore/QDir>
38 #include <QtCore/QFile>
39 #include <QtCore/QFileInfo>
40 #include <QtCore/QFutureSynchronizer>
41 #include <QtConcurrent/QtConcurrent>
42 #include <QtCore/QXmlStreamReader>
43 #include <QDirIterator>
44 #include <QRegularExpression>
45 #include <QRegularExpressionMatch>
46 #include <QStringList>
47 #include <QMessageBox>
48 #include <QUrl>
49 #include <QDebug>
50
51 #include "BookManipulation/FolderKeeper.h"
52 #include "BookManipulation/CleanSource.h"
53 #include "Importers/ImportEPUB.h"
54 #include "Misc/MediaTypes.h"
55 #include "Misc/FontObfuscation.h"
56 #include "Misc/HTMLEncodingResolver.h"
57 #include "Misc/QCodePage437Codec.h"
58 #include "Misc/SettingsStore.h"
59 #include "Misc/Utility.h"
60 #include "ResourceObjects/CSSResource.h"
61 #include "ResourceObjects/HTMLResource.h"
62 #include "ResourceObjects/OPFResource.h"
63 #include "ResourceObjects/NCXResource.h"
64 #include "ResourceObjects/Resource.h"
65 #include "ResourceObjects/OPFParser.h"
66 #include "sigil_constants.h"
67 #include "sigil_exception.h"
68
69 #ifndef MAX_PATH
70 // Set Max length to 256 because that's the max path size on many systems.
71 #define MAX_PATH 256
72 #endif
73 // This is the same read buffer size used by Java and Perl.
74 #define BUFF_SIZE 8192
75
76 const QString DUBLIN_CORE_NS = "http://purl.org/dc/elements/1.1/";
77 static const QString OEBPS_MIMETYPE = "application/oebps-package+xml";
78 static const QString UPDATE_ERROR_STRING = "SG_ERROR";
79 const QString NCX_MIMETYPE = "application/x-dtbncx+xml";
80 static const QString NCX_EXTENSION = "ncx";
81 const QString ADOBE_FONT_ALGO_ID = "http://ns.adobe.com/pdf/enc#RC";
82 const QString IDPF_FONT_ALGO_ID = "http://www.idpf.org/2008/embedding";
83 static const QString CONTAINER_XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
84 "<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\n"
85 " <rootfiles>\n"
86 " <rootfile full-path=\"%1\" media-type=\"application/oebps-package+xml\"/>\n"
87 " </rootfiles>\n"
88 "</container>\n";
89
90 static QCodePage437Codec *cp437 = 0;
91
92 // Constructor;
93 // The parameter is the file to be imported
ImportEPUB(const QString & fullfilepath)94 ImportEPUB::ImportEPUB(const QString &fullfilepath)
95 : Importer(fullfilepath),
96 m_ExtractedFolderPath(m_TempFolder.GetPath()),
97 m_HasSpineItems(false),
98 m_NCXNotInManifest(false),
99 m_NavResource(NULL)
100 {
101 }
102
103 // Reads and parses the file
104 // and returns the created Book
GetBook(bool extract_metadata)105 QSharedPointer<Book> ImportEPUB::GetBook(bool extract_metadata)
106 {
107 QList<HTMLResource *> non_well_formed;
108 SettingsStore ss;
109
110 if (!Utility::IsFileReadable(m_FullFilePath)) {
111 throw (EPUBLoadParseError(QString(QObject::tr("Cannot read EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
112 }
113
114 // These read the EPUB file
115 ExtractContainer();
116 QHash<QString, QString> encrypted_files = ParseEncryptionXml();
117
118 if (BookContentEncrypted(encrypted_files)) {
119 throw (FileEncryptedWithDrm(""));
120 }
121
122 QApplication::setOverrideCursor(Qt::WaitCursor);
123
124 LocateOPF();
125 m_opfDir = QFileInfo(m_OPFFilePath).dir();
126 // These mutate the m_Book object
127 ReadOPF();
128 AddObfuscatedButUndeclaredFonts(encrypted_files);
129 AddNonStandardAppleXML();
130
131 m_Book->GetFolderKeeper()->SetGroupFolders(m_ManifestFilePaths, m_ManifestMediaTypes);
132
133 LoadInfrastructureFiles();
134
135 // Check for files missing in the Manifest and create warning
136 QStringList notInManifest;
137 foreach(QString file_path, m_ZipFilePaths) {
138 // skip mimetype and anything in META-INF and the opf itself
139 if (file_path == "mimetype") continue;
140 if (file_path.startsWith("META-INF")) continue;
141 if (m_OPFFilePath.contains(file_path)) continue;
142 if (!m_ManifestFilePaths.contains(file_path)) {
143 notInManifest << file_path;
144 }
145 }
146
147 if (!notInManifest.isEmpty()) {
148 QApplication::restoreOverrideCursor();
149 Utility::DisplayStdWarningDialog(tr("Files exist in epub that are not listed in the manifest, they will be ignored"), notInManifest.join("\n"));
150 QApplication::setOverrideCursor(Qt::WaitCursor);
151 }
152
153 LoadFolderStructure();
154
155 const QList<Resource *> resources = m_Book->GetFolderKeeper()->GetResourceList();
156
157 // We're going to check all html files until we find one that isn't well formed then we'll prompt
158 // the user if they want to auto fix or not.
159 //
160 // If we have non-well formed content and they shouldn't be auto fixed we'll pass that on to
161 // the universal update function so it knows to skip them. Otherwise we won't include them and
162 // let it modify the file.
163 for (int i=0; i<resources.count(); ++i) {
164 if (resources.at(i)->Type() == Resource::HTMLResourceType) {
165 HTMLResource *hresource = qobject_cast<HTMLResource *>(resources.at(i));
166 if (!hresource) {
167 continue;
168 }
169 // Load the content into the HTMLResource so we can perform a well formed check.
170 try {
171 hresource->SetText(HTMLEncodingResolver::ReadHTMLFile(hresource->GetFullPath()));
172 } catch (...) {
173 if (ss.cleanOn() & CLEANON_OPEN) {
174 non_well_formed << hresource;
175 continue;
176 }
177 }
178 if (ss.cleanOn() & CLEANON_OPEN) {
179 if (!XhtmlDoc::IsDataWellFormed(hresource->GetText(),hresource->GetEpubVersion())) {
180 non_well_formed << hresource;
181 } else {
182 QString txt = hresource->GetText();
183 // had cases of large files with no line breaks
184 if (txt.size() > 307200) {
185 int lines = 0;
186 QChar *uc = txt.data();
187 QChar *e = uc + txt.size();
188 for (; uc != e; ++uc) {
189 if (uc->unicode() == 0x000A) lines++;
190 }
191 if (lines < 5) non_well_formed << hresource;
192 }
193 }
194 }
195 }
196 }
197 if (!non_well_formed.isEmpty()) {
198 QApplication::restoreOverrideCursor();
199 if (QMessageBox::Yes == QMessageBox::warning(QApplication::activeWindow(),
200 tr("Sigil"),
201 tr("This EPUB has HTML files that are not well formed or are "
202 "missing a DOCTYPE, html, head or body elements. "
203 "Sigil can automatically fix these files, although "
204 "this may result in minor data loss in extreme circumstances.\n\n"
205 "Do you want to automatically fix the files?"),
206 QMessageBox::Yes|QMessageBox::No))
207 {
208 foreach(HTMLResource* htmlres, non_well_formed) {
209 QString fixed_text = CleanSource::Mend(htmlres->GetText(),htmlres->GetEpubVersion());
210 htmlres->SetText(fixed_text);
211 }
212 non_well_formed.clear();
213 }
214 QApplication::setOverrideCursor(Qt::WaitCursor);
215 }
216
217 ProcessFontFiles(resources, encrypted_files);
218
219 if (m_PackageVersion.startsWith('3')) {
220 HTMLResource * nav_resource = NULL;
221 if (m_NavResource) {
222 if (m_NavResource->Type() == Resource::HTMLResourceType) {
223 nav_resource = qobject_cast<HTMLResource*>(m_NavResource);
224 }
225 }
226 if (!nav_resource) {
227 // we need to create a nav file here because one was not found
228 // it will automatically be added to the content.opf
229 nav_resource = m_Book->CreateEmptyNavFile(true);
230 Resource * res = qobject_cast<Resource *>(nav_resource);
231 m_Book->GetOPF()->SetItemRefLinear(res, false);
232 }
233 m_Book->GetOPF()->SetNavResource(nav_resource);
234 }
235
236
237 if (m_NCXNotInManifest && m_PackageVersion.startsWith('2')) {
238 // We manually created an NCX file because there wasn't one in the manifest.
239 // Need to create a new manifest id for it.
240 m_NCXId = m_Book->GetOPF()->AddNCXItem(m_NCXFilePath);
241 }
242
243 NCXResource * ncxresource = m_Book->GetNCX();
244
245 if (ncxresource) {
246 // Ensure that our spine has a <spine toc="ncx"> element on it now in case it was missing.
247 m_Book->GetOPF()->UpdateNCXOnSpine(m_NCXId);
248 // Make sure the <item> for the NCX in the manifest reflects correct href path
249 m_Book->GetOPF()->UpdateNCXLocationInManifest(ncxresource);
250 }
251
252 // If spine was not present or did not contain any items, recreate the OPF from scratch
253 // preserving any important metadata elements and making a new reading order.
254 if (!m_HasSpineItems) {
255 QList<MetaEntry> originalMetadata = m_Book->GetOPF()->GetDCMetadata();
256 m_Book->GetOPF()->AutoFixWellFormedErrors();
257 if (extract_metadata) {
258 m_Book->GetOPF()->SetDCMetadata(originalMetadata);
259 }
260 AddLoadWarning(QObject::tr("The OPF file does not contain a valid spine.") % "\n" %
261 QObject::tr("Sigil has created a new one for you."));
262 }
263
264 // update the ShortPathNames to reflect any name duplication
265 m_Book->GetFolderKeeper()->updateShortPathNames();
266
267 // since we no longer run universal updates we should run
268 // InitialLoad on all TextResources to make sure everything gets loaded
269 m_Book->GetFolderKeeper()->PerformInitialLoads();
270
271 // If we have modified the book to add spine attribute, manifest item or NCX mark as changed.
272 m_Book->SetModified(GetLoadWarnings().count() > 0);
273 QApplication::restoreOverrideCursor();
274 return m_Book;
275 }
276
277
ParseEncryptionXml()278 QHash<QString, QString> ImportEPUB::ParseEncryptionXml()
279 {
280 QString encrpytion_xml_path = m_ExtractedFolderPath + "/META-INF/encryption.xml";
281
282 if (!QFileInfo(encrpytion_xml_path).exists()) {
283 return QHash<QString, QString>();
284 }
285
286 QXmlStreamReader encryption(Utility::ReadUnicodeTextFile(encrpytion_xml_path));
287 QHash<QString, QString> encrypted_files;
288 QString encryption_algo;
289 QString uri;
290
291 while (!encryption.atEnd()) {
292 encryption.readNext();
293
294 if (encryption.isStartElement()) {
295 if (encryption.name() == "EncryptionMethod") {
296 encryption_algo = encryption.attributes().value("", "Algorithm").toString();
297 } else if (encryption.name() == "CipherReference") {
298 // Note: fragments are not part of the CipherReference specs so this is okay
299 uri = Utility::URLDecodePath(encryption.attributes().value("", "URI").toString());
300 // hack to handle non-spec encryption file url relative to META-INF instead
301 // of being absolute from epub root as the spec calls for
302 if (uri.startsWith("../")) uri = uri.mid(3,-1);
303 encrypted_files[ uri ] = encryption_algo;
304 }
305 }
306 }
307
308 if (encryption.hasError()) {
309 const QString error = QString(QObject::tr("Error parsing encryption xml.\nLine: %1 Column %2 - %3"))
310 .arg(encryption.lineNumber())
311 .arg(encryption.columnNumber())
312 .arg(encryption.errorString());
313 throw (EPUBLoadParseError(error.toStdString()));
314 }
315
316 return encrypted_files;
317 }
318
319
BookContentEncrypted(const QHash<QString,QString> & encrypted_files)320 bool ImportEPUB::BookContentEncrypted(const QHash<QString, QString> &encrypted_files)
321 {
322 foreach(QString algorithm, encrypted_files.values()) {
323 if (algorithm != ADOBE_FONT_ALGO_ID &&
324 algorithm != IDPF_FONT_ALGO_ID) {
325 return true;
326 }
327 }
328 return false;
329 }
330
331
332 // This is basically a workaround for old versions of InDesign not listing the fonts it
333 // embedded in the OPF manifest, even though the specs say it has to.
334 // It does list them in the encryption.xml, so we use that.
AddObfuscatedButUndeclaredFonts(const QHash<QString,QString> & encrypted_files)335 void ImportEPUB::AddObfuscatedButUndeclaredFonts(const QHash<QString, QString> &encrypted_files)
336 {
337 if (encrypted_files.empty()) {
338 return;
339 }
340
341 QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
342 foreach(QString filepath, encrypted_files.keys()) {
343 if (!FONT_EXTENSIONS.contains(QFileInfo(filepath).suffix().toLower())) {
344 continue;
345 }
346
347 // Only add the path to the manifest if it is not already included.
348 QMapIterator<QString, QString> valueSearch(m_Files);
349
350 if (!valueSearch.findNext(opf_dir.relativeFilePath(filepath))) {
351 m_Files[ Utility::CreateUUID() ] = opf_dir.relativeFilePath(filepath);
352 }
353 }
354 }
355
356
357 // Another workaround for non-standard Apple files
358 // At present it only handles com.apple.ibooks.display-options.xml, but any
359 // further iBooks aberrations should be handled here as well.
AddNonStandardAppleXML()360 void ImportEPUB::AddNonStandardAppleXML()
361 {
362 QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
363 QStringList aberrant_Apple_filenames;
364 aberrant_Apple_filenames.append(m_ExtractedFolderPath + "/META-INF/com.apple.ibooks.display-options.xml");
365
366 for (int i = 0; i < aberrant_Apple_filenames.size(); ++i) {
367 if (QFile::exists(aberrant_Apple_filenames.at(i))) {
368 m_Files[ Utility::CreateUUID() ] = opf_dir.relativeFilePath(aberrant_Apple_filenames.at(i));
369 }
370 }
371 }
372
373
374 // Each resource can provide us with its new path. encrypted_files provides
375 // a mapping from the resource paths to the obfuscation algorithms.
ProcessFontFiles(const QList<Resource * > & resources,const QHash<QString,QString> & encrypted_files)376 void ImportEPUB::ProcessFontFiles(const QList<Resource *> &resources,
377 const QHash<QString, QString> &encrypted_files)
378 {
379 if (encrypted_files.empty()) {
380 return;
381 }
382
383 QList<FontResource *> font_resources = m_Book->GetFolderKeeper()->GetResourceTypeList<FontResource>();
384
385 if (font_resources.empty()) {
386 return;
387 }
388
389 foreach(FontResource * font_resource, font_resources) {
390 QString match_path = font_resource->GetRelativePath();
391 QString algorithm = encrypted_files.value(match_path);
392
393 if (algorithm.isEmpty()) {
394 continue;
395 }
396
397 font_resource->SetObfuscationAlgorithm(algorithm);
398
399 // Actually we are de-obfuscating, but the inverse operations of the obfuscation methods
400 // are the obfuscation methods themselves. For the math oriented, the obfuscation methods
401 // are involutary [ f( f( x ) ) = x ].
402 if (algorithm == ADOBE_FONT_ALGO_ID) {
403 FontObfuscation::ObfuscateFile(font_resource->GetFullPath(), algorithm, m_UuidIdentifierValue);
404 } else {
405 FontObfuscation::ObfuscateFile(font_resource->GetFullPath(), algorithm, m_UniqueIdentifierValue);
406 }
407 }
408 }
409
ExtractContainer()410 void ImportEPUB::ExtractContainer()
411 {
412 int res = 0;
413 if (!cp437) {
414 cp437 = new QCodePage437Codec();
415 }
416 #ifdef Q_OS_WIN32
417 zlib_filefunc64_def ffunc;
418 fill_win32_filefunc64W(&ffunc);
419 unzFile zfile = unzOpen2_64(Utility::QStringToStdWString(QDir::toNativeSeparators(m_FullFilePath)).c_str(), &ffunc);
420 #else
421 unzFile zfile = unzOpen64(QDir::toNativeSeparators(m_FullFilePath).toUtf8().constData());
422 #endif
423
424 if (zfile == NULL) {
425 throw (EPUBLoadParseError(QString(QObject::tr("Cannot unzip EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
426 }
427
428 res = unzGoToFirstFile(zfile);
429
430 if (res == UNZ_OK) {
431 do {
432 // Get the name of the file in the archive.
433 char file_name[MAX_PATH] = {0};
434 unz_file_info64 file_info;
435 unzGetCurrentFileInfo64(zfile, &file_info, file_name, MAX_PATH, NULL, 0, NULL, 0);
436 QString qfile_name;
437 QString cp437_file_name;
438 qfile_name = QString::fromUtf8(file_name);
439 if (!(file_info.flag & (1<<11))) {
440 // General purpose bit 11 says the filename is utf-8 encoded. If not set then
441 // IBM 437 encoding might be used.
442 cp437_file_name = cp437->toUnicode(file_name);
443 }
444
445 // If there is no file name then we can't do anything with it.
446 if (!qfile_name.isEmpty()) {
447
448 // for security reasons against maliciously crafted zip archives
449 // we need the file path to always be inside the target folder
450 // and not outside, so we will remove all illegal backslashes
451 // and all relative upward paths segments "/../" from the zip's local
452 // file name/path before prepending the target folder to create
453 // the final path
454
455 QString original_path = qfile_name;
456 bool evil_or_corrupt_epub = false;
457
458 if (qfile_name.contains("\\")) evil_or_corrupt_epub = true;
459 qfile_name = "/" + qfile_name.replace("\\","");
460
461 if (qfile_name.contains("/../")) evil_or_corrupt_epub = true;
462 qfile_name = qfile_name.replace("/../","/");
463
464 while(qfile_name.startsWith("/")) {
465 qfile_name = qfile_name.remove(0,1);
466 }
467
468 if (cp437_file_name.contains("\\")) evil_or_corrupt_epub = true;
469 cp437_file_name = "/" + cp437_file_name.replace("\\","");
470
471 if (cp437_file_name.contains("/../")) evil_or_corrupt_epub = true;
472 cp437_file_name = cp437_file_name.replace("/../","/");
473
474 while(cp437_file_name.startsWith("/")) {
475 cp437_file_name = cp437_file_name.remove(0,1);
476 }
477
478 if (evil_or_corrupt_epub) {
479 unzCloseCurrentFile(zfile);
480 unzClose(zfile);
481 throw (EPUBLoadParseError(QString(QObject::tr("Possible evil or corrupt epub file name: %1")).arg(original_path).toStdString()));
482 }
483
484 // We use the dir object to create the path in the temporary directory.
485 // Unfortunately, we need a dir ojbect to do this as it's not a static function.
486 QDir dir(m_ExtractedFolderPath);
487 // Full file path in the temporary directory.
488 QString file_path = m_ExtractedFolderPath + "/" + qfile_name;
489 QFileInfo qfile_info(file_path);
490
491 // Is this entry a directory?
492 if (file_info.uncompressed_size == 0 && qfile_name.endsWith('/')) {
493 dir.mkpath(qfile_name);
494 continue;
495 } else {
496 if (!qfile_info.path().isEmpty()) dir.mkpath(qfile_info.path());
497 // add it to the list of files found inside the zip
498 if (cp437_file_name.isEmpty()) {
499 m_ZipFilePaths << qfile_name;
500 } else {
501 m_ZipFilePaths << cp437_file_name;
502 }
503 }
504
505 // Open the file entry in the archive for reading.
506 if (unzOpenCurrentFile(zfile) != UNZ_OK) {
507 unzClose(zfile);
508 throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
509 }
510
511 // Open the file on disk to write the entry in the archive to.
512 QFile entry(file_path);
513
514 if (!entry.open(QIODevice::WriteOnly | QIODevice::Truncate)) {
515 unzCloseCurrentFile(zfile);
516 unzClose(zfile);
517 throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
518 }
519
520 // Buffered reading and writing.
521 char buff[BUFF_SIZE] = {0};
522 int read = 0;
523
524 while ((read = unzReadCurrentFile(zfile, buff, BUFF_SIZE)) > 0) {
525 entry.write(buff, read);
526 }
527
528 entry.close();
529
530 // Read errors are marked by a negative read amount.
531 if (read < 0) {
532 unzCloseCurrentFile(zfile);
533 unzClose(zfile);
534 throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
535 }
536
537 // The file was read but the CRC did not match.
538 // We don't check the read file size vs the uncompressed file size
539 // because if they're different there should be a CRC error.
540 if (unzCloseCurrentFile(zfile) == UNZ_CRCERROR) {
541 unzClose(zfile);
542 throw (EPUBLoadParseError(QString(QObject::tr("Cannot extract file: %1")).arg(qfile_name).toStdString()));
543 }
544 if (!cp437_file_name.isEmpty() && cp437_file_name != qfile_name) {
545 QString cp437_file_path = m_ExtractedFolderPath + "/" + cp437_file_name;
546 QFile::copy(file_path, cp437_file_path);
547 }
548 }
549 } while ((res = unzGoToNextFile(zfile)) == UNZ_OK);
550 }
551
552 if (res != UNZ_END_OF_LIST_OF_FILE) {
553 unzClose(zfile);
554 throw (EPUBLoadParseError(QString(QObject::tr("Cannot open EPUB: %1")).arg(QDir::toNativeSeparators(m_FullFilePath)).toStdString()));
555 }
556
557 unzClose(zfile);
558 }
559
LocateOPF()560 void ImportEPUB::LocateOPF()
561 {
562 QString fullpath = m_ExtractedFolderPath + "/META-INF/container.xml";
563 QXmlStreamReader container;
564 try {
565 container.addData(Utility::ReadUnicodeTextFile(fullpath));
566 } catch (CannotOpenFile&) {
567 // Find the first OPF file.
568 QString OPFfile;
569 QDirIterator files(m_ExtractedFolderPath, QStringList() << "*.opf", QDir::NoFilter, QDirIterator::Subdirectories);
570 while (files.hasNext()) {
571 OPFfile = QDir(m_ExtractedFolderPath).relativeFilePath(files.next());
572 break;
573 }
574
575 if (OPFfile.isEmpty()) {
576 std::string msg = fullpath.toStdString() + ": " + tr("Epub has missing or improperly specified OPF.").toStdString();
577 throw (CannotOpenFile(msg));
578 }
579
580 // Create a default container.xml.
581 QDir folder(m_ExtractedFolderPath);
582 folder.mkdir("META-INF");
583 Utility::WriteUnicodeTextFile(CONTAINER_XML.arg(OPFfile), fullpath);
584 container.addData(Utility::ReadUnicodeTextFile(fullpath));
585 }
586
587 int num_opf = 0;
588
589 while (!container.atEnd()) {
590 container.readNext();
591
592 if (container.isStartElement() && container.name() == "rootfile") {
593 if (container.attributes().hasAttribute("media-type") &&
594 container.attributes().value("", "media-type") == OEBPS_MIMETYPE) {
595 // As per OCF spec, the first rootfile element
596 // with the OEBPS mimetype is considered the "main" one.
597 if (m_OPFFilePath.isEmpty()) {
598 m_OPFFilePath = m_ExtractedFolderPath + "/" + container.attributes().value("", "full-path").toString();
599 }
600 num_opf++;
601
602 }
603 }
604 }
605
606 if (container.hasError()) {
607 const QString error = QString(
608 QObject::tr("Unable to parse container.xml file.\nLine: %1 Column %2 - %3"))
609 .arg(container.lineNumber())
610 .arg(container.columnNumber())
611 .arg(container.errorString());
612 throw (EPUBLoadParseError(error.toStdString()));
613 }
614
615 if (num_opf > 1) {
616 Utility::DisplayStdWarningDialog(tr("This epub has multiple renditions (multiple OPF files). Editing this epub in Sigil will produce a normal single rendition epub using only the main (first) OPF file found."),"");
617 }
618
619 if (m_OPFFilePath.isEmpty() || !QFile::exists(m_OPFFilePath)) {
620 throw (EPUBLoadParseError(QString(QObject::tr("No appropriate OPF file found")).toStdString()));
621 }
622 }
623
624
ReadOPF()625 void ImportEPUB::ReadOPF()
626 {
627 QString opf_text = CleanSource::ProcessXML(PrepareOPFForReading(Utility::ReadUnicodeTextFile(m_OPFFilePath)),OEBPS_MIMETYPE);
628 QXmlStreamReader opf_reader(opf_text);
629 QString ncx_id_on_spine;
630
631 while (!opf_reader.atEnd()) {
632 opf_reader.readNext();
633
634 if (!opf_reader.isStartElement()) {
635 continue;
636 }
637
638 if (opf_reader.name() == "package") {
639 m_UniqueIdentifierId = opf_reader.attributes().value("", "unique-identifier").toString();
640 m_PackageVersion = opf_reader.attributes().value("", "version").toString();
641 if (m_PackageVersion == "1.0") m_PackageVersion = "2.0";
642 }
643
644 else if (opf_reader.name() == "identifier") {
645 ReadIdentifierElement(&opf_reader);
646 }
647
648 // epub3 look for linked metadata resources that are included inside the epub
649 // but that are not and must not be included in the manifest
650 else if (opf_reader.name() == "link") {
651 ReadMetadataLinkElement(&opf_reader);
652 }
653
654 // Get the list of content files that
655 // make up the publication
656 else if (opf_reader.name() == "item") {
657 ReadManifestItemElement(&opf_reader);
658 }
659
660 // We read this just to get the NCX id
661 else if (opf_reader.name() == "spine") {
662 ncx_id_on_spine = opf_reader.attributes().value("", "toc").toString();
663 }
664
665 else if (opf_reader.name() == "itemref") {
666 m_HasSpineItems = true;
667 }
668 }
669
670 if (opf_reader.hasError()) {
671 const QString error = QString(QObject::tr("Unable to read OPF file.\nLine: %1 Column %2 - %3"))
672 .arg(opf_reader.lineNumber())
673 .arg(opf_reader.columnNumber())
674 .arg(opf_reader.errorString());
675 throw (EPUBLoadParseError(error.toStdString()));
676 }
677
678
679 //Important! The OPF Resource in the new book must be created now before adding to it in any way
680 QString bookpath;
681 bookpath = m_OPFFilePath.right(m_OPFFilePath.length() - m_ExtractedFolderPath.length() - 1);
682 m_Book->GetFolderKeeper()->AddOPFToFolder(m_PackageVersion, bookpath);
683
684 // Ensure we have an NCX available
685 LocateOrCreateNCX(ncx_id_on_spine);
686
687 }
688
689
ReadIdentifierElement(QXmlStreamReader * opf_reader)690 void ImportEPUB::ReadIdentifierElement(QXmlStreamReader *opf_reader)
691 {
692 QString id = opf_reader->attributes().value("", "id").toString();
693 QString scheme = opf_reader->attributes().value("", "scheme").toString();
694 QString value = opf_reader->readElementText();
695
696 if (id == m_UniqueIdentifierId) {
697 m_UniqueIdentifierValue = value;
698 }
699
700 if (m_UuidIdentifierValue.isEmpty() &&
701 (value.contains("urn:uuid:") || scheme.toLower() == "uuid")) {
702 m_UuidIdentifierValue = value;
703 }
704 }
705
ReadMetadataLinkElement(QXmlStreamReader * opf_reader)706 void ImportEPUB::ReadMetadataLinkElement(QXmlStreamReader *opf_reader)
707 {
708 QString relation = opf_reader->attributes().value("", "rel").toString();
709 QString mtype = opf_reader->attributes().value("", "media-type").toString();
710 QString props = opf_reader->attributes().value("", "properties").toString();
711 QString href = opf_reader->attributes().value("", "href").toString();
712 if (!href.isEmpty()) {
713 QUrl url = QUrl(href);
714 if (url.isRelative()) {
715 // we have a local unmanifested metadata file to handle
716 // attempt to map deprecated record types into proper media-types
717 if (relation == "marc21xml-record") {
718 mtype = "application/marcxml+xml";
719 }
720 else if (relation == "mods-record") {
721 mtype = "application/mods+xml";
722 }
723 else if (relation == "onix-record") {
724 mtype = "application/xml;onix";
725 }
726 else if (relation == "xmp-record") {
727 mtype = "application/xml;xmp";
728 }
729 else if (relation == "record") {
730 if (props == "onix") mtype = "application/xml;onix";
731 if (props == "xmp") mtype = "application/xml;xmp";
732 }
733 QDir opf_dir = QFileInfo(m_OPFFilePath).dir();
734 QString path = opf_dir.absolutePath() + "/" + url.path();
735 if (QFile::exists(path)) {
736 QString id = Utility::CreateUUID();
737 m_Files[ id ] = opf_dir.relativeFilePath(path);
738 m_FileMimetypes[ id ] = mtype;
739 }
740 }
741 }
742 }
743
ReadManifestItemElement(QXmlStreamReader * opf_reader)744 void ImportEPUB::ReadManifestItemElement(QXmlStreamReader *opf_reader)
745 {
746 QString id = opf_reader->attributes().value("", "id").toString();
747 QString href = opf_reader->attributes().value("", "href").toString();
748 QString type = opf_reader->attributes().value("", "media-type").toString();
749 QString properties = opf_reader->attributes().value("", "properties").toString();
750 // FIXME: can epub3 OPF Manifest href attributes include fragments?
751 // FIXME: under epub2 fragments are explicitly outlawed in spec
752 // For robustness sake we will assume they can but ...
753 // Note: Under epub3 they can point outside the epub so need to handle full url
754
755 QString apath;
756 if (href.indexOf(':') == -1) {
757 // we know we have a relative href to a file so no fragments can exist
758 apath = Utility::URLDecodePath(href);
759 }
760 // for hrefs pointing outside the epub, apath will be empty
761 // qDebug() << "ImportEpub with Manifest item: " << href << apath;
762 QString extension = QFileInfo(apath).suffix().toLower();
763
764 // validate the media type if we can, and warn otherwise
765 QString group = MediaTypes::instance()->GetGroupFromMediaType(type,"");
766 QString ext_mtype = MediaTypes::instance()->GetMediaTypeFromExtension(extension, "");
767 if (type.isEmpty() || group.isEmpty()) {
768 const QString load_warning = QObject::tr("The OPF uses an unrecognized media type \"%1\" for file \"%2\"").arg(type).arg(QFileInfo(apath).fileName()) +
769 " - " + QObject::tr("A temporary media type of \"%1\" has been assigned. You should edit your OPF file to fix this problem.").arg(ext_mtype);
770 AddLoadWarning(load_warning);
771 }
772
773 if (!apath.isEmpty()) {
774
775 // find the epub root relative file path from the opf location and the item href
776 QString file_path = m_opfDir.absolutePath() + "/" + apath;
777 file_path = Utility::resolveRelativeSegmentsInFilePath(file_path,"/");
778 file_path = file_path.remove(0, m_ExtractedFolderPath.length() + 1);
779
780 // Manifest Items may *NOT* live in the META-INF and the mimetype file should NOT be manifested
781 if (file_path.startsWith("META-INF/") || (file_path == "mimetype")) {
782 const QString load_warning = QObject::tr("The OPF has an illegal Manifest entry for a file inside the META-INF folder for file \"%1\"").arg(QFileInfo(file_path).fileName()) +
783 " - " + QObject::tr("You should edit your OPF file to remove this entry.");
784 AddLoadWarning(load_warning);
785 return;
786 }
787
788 if (type != NCX_MIMETYPE && extension != NCX_EXTENSION) {
789 if (!m_ManifestFilePaths.contains(file_path)) {
790 if (m_Files.contains(id)) {
791 // We have an error situation with a duplicate id in the epub.
792 // We must warn the user, but attempt to use another id so the epub can still be loaded.
793 QString base_id = QFileInfo(apath).fileName();
794 QString new_id(base_id);
795 int duplicate_index = 0;
796
797 while (m_Files.contains(new_id)) {
798 duplicate_index++;
799 new_id = QString("%1%2").arg(base_id).arg(duplicate_index);
800 }
801
802 const QString load_warning = QObject::tr("The OPF manifest contains duplicate ids for: %1").arg(id) +
803 " - " + QObject::tr("A temporary id has been assigned to load this EPUB. You should edit your OPF file to remove the duplication.");
804 id = new_id;
805 AddLoadWarning(load_warning);
806 }
807
808 m_Files[ id ] = apath;
809 m_FileMimetypes[ id ] = type;
810 m_ManifestFilePaths << file_path;
811 m_ManifestMediaTypes << type;
812
813 // store information about any nav document
814 if (properties.contains("nav")) {
815 m_NavId = id;
816 m_NavHref = apath;
817 }
818 }
819 } else {
820 m_NcxCandidates[ id ] = apath;
821 m_ManifestFilePaths << file_path;
822 m_ManifestMediaTypes << type;
823 }
824 }
825 }
826
827
LocateOrCreateNCX(const QString & ncx_id_on_spine)828 void ImportEPUB::LocateOrCreateNCX(const QString &ncx_id_on_spine)
829 {
830 QString load_warning;
831 QString ncx_href = "";
832 m_NCXId = ncx_id_on_spine;
833
834 // handle the normal/proper case of an ncx id on the spine matching an ncx candidate that exists
835 if (!m_NCXId.isEmpty() && m_NcxCandidates.contains(m_NCXId)) {
836 QString bookpath;
837 ncx_href = m_NcxCandidates[ m_NCXId ];
838 m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() % "/" % ncx_href;
839 m_NCXFilePath = Utility::resolveRelativeSegmentsInFilePath(m_NCXFilePath, "/");
840 bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
841 m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
842 m_NCXNotInManifest = false;
843 return;
844 }
845
846 bool found = false;
847
848 // now handle ncx not specified in spine but file with ncx extension exists in manifest
849 // Search for the ncx in the manifest by looking for files with
850 // a .ncx extension.
851 if (m_NCXId.isEmpty()) {
852
853 QHashIterator<QString, QString> ncxSearch(m_NcxCandidates);
854 while (ncxSearch.hasNext()) {
855 ncxSearch.next();
856
857 if (QFileInfo(ncxSearch.value()).suffix().toLower() == NCX_EXTENSION) {
858 // we found a file with an ncx extension
859 m_NCXId = ncxSearch.key();
860 found = true;
861 break;
862 }
863 }
864 }
865
866 if (found) {
867 // m_NCXId has been properly set
868 ncx_href = m_NcxCandidates[ m_NCXId ];
869 m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() % "/" % ncx_href;
870 m_NCXFilePath = Utility::resolveRelativeSegmentsInFilePath(m_NCXFilePath, "/");
871
872 QString bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
873 m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
874 m_NCXNotInManifest = false;
875 load_warning = QObject::tr("The OPF file did not identify the NCX file correctly.") + "\n" +
876 " - " + QObject::tr("Sigil has used the following file as the NCX:") +
877 QString(" %1").arg(m_NcxCandidates[ m_NCXId ]);
878
879 AddLoadWarning(load_warning);
880 return;
881 }
882
883 // An NCX is only required in epub2 so punt here if epub3
884 if ( m_PackageVersion.startsWith('3') ) return;
885
886 // epub2 only here
887
888 // If we reached here there is no file with an ncx file extension in the manifest
889 // There might be a file with an ncx extension inside the epub zip folder but
890 // since it was unmanifested, we will not use it anyway.
891 // So we need to create a new one and thereby handle the following
892 // failure conditions:
893 // - ncx specified in spine, but no matching manifest item entry
894 // - ncx file not physically present
895 // - ncx not in spine or manifest item
896
897 m_NCXNotInManifest = true;
898
899 load_warning = QObject::tr("The OPF file does not contain an NCX file.") + "\n" +
900 " - " + QObject::tr("Sigil has created a new one for you.");
901
902 m_NCXFilePath = QFileInfo(m_OPFFilePath).absolutePath() + "/toc.ncx";
903
904 // Create a new file for the NCX in the *Extracted Folder* Path
905 // We are relying on an identifier being set from the metadata.
906 // It might not have one if the book does not have the urn:uuid: format.
907 NCXResource ncx_resource(m_ExtractedFolderPath, m_NCXFilePath, m_PackageVersion, NULL);
908 ncx_resource.SetEpubVersion(m_PackageVersion);
909 // put it beside the OPF file
910 ncx_resource.FillWithDefaultText(m_PackageVersion, QFileInfo(m_OPFFilePath).absolutePath());
911 if (!m_UuidIdentifierValue.isEmpty()) {
912 ncx_resource.SetMainID(m_UuidIdentifierValue);
913 }
914 ncx_resource.SaveToDisk();
915
916 // now add the NCX to our folder
917 QString bookpath = m_NCXFilePath.right(m_NCXFilePath.length() - m_ExtractedFolderPath.length() - 1);
918 m_Book->GetFolderKeeper()->AddNCXToFolder(m_PackageVersion, bookpath);
919
920 if (!load_warning.isEmpty()) {
921 AddLoadWarning(load_warning);
922 }
923 }
924
925
LoadInfrastructureFiles()926 void ImportEPUB::LoadInfrastructureFiles()
927 {
928 // always SetEpubVersion before SetText in OPF as SetText will validate with it
929 m_Book->GetOPF()->SetEpubVersion(m_PackageVersion);
930 m_Book->GetOPF()->SetText(CleanSource::ProcessXML(PrepareOPFForReading(Utility::ReadUnicodeTextFile(m_OPFFilePath)),OEBPS_MIMETYPE));
931 QString OPFBookRelPath = m_OPFFilePath;
932 OPFBookRelPath = OPFBookRelPath.remove(0,m_ExtractedFolderPath.length()+1);
933 m_Book->GetOPF()->SetCurrentBookRelPath(OPFBookRelPath);
934 NCXResource * ncxresource = m_Book->GetNCX();
935 if (ncxresource) {
936 ncxresource->SetEpubVersion(m_PackageVersion);
937 ncxresource->SetText(CleanSource::ProcessXML(Utility::ReadUnicodeTextFile(m_NCXFilePath),"application/x-dtbncx+xml"));
938 QString NCXBookRelPath = m_NCXFilePath;
939 NCXBookRelPath = NCXBookRelPath.remove(0,m_ExtractedFolderPath.length()+1);
940 ncxresource->SetCurrentBookRelPath(NCXBookRelPath);
941 }
942 }
943
944
LoadFolderStructure()945 bool ImportEPUB::LoadFolderStructure()
946 {
947 QList<QString> keys = m_Files.keys();
948 int num_files = keys.count();
949 bool success = true;
950
951 QFutureSynchronizer<std::tuple<QString, QString>> sync;
952
953 for (int i = 0; i < num_files; ++i) {
954 QString id = keys.at(i);
955 sync.addFuture(QtConcurrent::run(
956 this,
957 &ImportEPUB::LoadOneFile,
958 m_Files.value(id),
959 m_FileMimetypes.value(id)));
960 }
961
962 sync.waitForFinished();
963 QList<QFuture<std::tuple<QString, QString>>> futures = sync.futures();
964 int num_futures = futures.count();
965
966 for (int i = 0; i < num_futures; ++i) {
967 std::tuple<QString, QString> result = futures.at(i).result();
968 if (std::get<0>(result) != std::get<1>(result)) {
969 qDebug() << "LoadFolderStructure Issue: " << std::get<0>(result) << std::get<1>(result);
970 success = false;
971 }
972 }
973
974 return success;
975 }
976
977
LoadOneFile(const QString & path,const QString & mimetype)978 std::tuple<QString, QString> ImportEPUB::LoadOneFile(const QString &path, const QString &mimetype)
979 {
980 // Use opf relative href to create the book path (currentpath) for this file
981 QString fullfilepath = QDir::cleanPath(QFileInfo(m_OPFFilePath).absolutePath() + "/" + path);
982 QString currentpath = fullfilepath;
983 currentpath = currentpath.remove(0,m_ExtractedFolderPath.length()+1);
984 try {
985 QString bookpath = currentpath;
986 Resource *resource = m_Book->GetFolderKeeper()->AddContentFileToFolder(fullfilepath, false, mimetype, bookpath);
987 if (path == m_NavHref) {
988 m_NavResource = resource;
989 }
990 QString newpath = resource->GetRelativePath();
991 return std::make_tuple(currentpath, newpath);
992 } catch (FileDoesNotExist&) {
993 return std::make_tuple(UPDATE_ERROR_STRING, UPDATE_ERROR_STRING);
994 }
995 }
996
997
// Returns the OPF source with the text captured by group 1 of
// VERSION_ATTRIBUTE replaced by "1.0".
//
// Only the first XML_DECLARATION_SEARCH_PREFIX_SIZE characters of source are
// searched; when the pattern does not match there, the source is returned
// unchanged.
QString ImportEPUB::PrepareOPFForReading(const QString &source)
{
    const QString search_area = source.left(XML_DECLARATION_SEARCH_PREFIX_SIZE);
    const QRegularExpression version_pattern(VERSION_ATTRIBUTE);
    const QRegularExpressionMatch declared_version = version_pattern.match(search_area);

    if (!declared_version.hasMatch()) {
        return source;
    }

    // MASSIVE hack for XML 1.1 "support";
    // this is only for people who specify
    // XML 1.1 when they actually only use XML 1.0.
    // search_area is a prefix of source, so the capture offsets are valid in the copy too.
    QString patched(source);
    patched.replace(declared_version.capturedStart(1), declared_version.capturedLength(1), "1.0");
    return patched;
}
1012