1 /************************************************************************
2 **
3 **  Copyright (C) 2018-2019 Kevin B. Hendricks, Stratford Ontario Canada
4 **  Copyright (C) 2012      John Schember <john@nachtimwald.com>
5 **  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
6 **
7 **  This file is part of Sigil.
8 **
9 **  Sigil is free software: you can redistribute it and/or modify
10 **  it under the terms of the GNU General Public License as published by
11 **  the Free Software Foundation, either version 3 of the License, or
12 **  (at your option) any later version.
13 **
14 **  Sigil is distributed in the hope that it will be useful,
15 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
16 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 **  GNU General Public License for more details.
18 **
19 **  You should have received a copy of the GNU General Public License
20 **  along with Sigil.  If not, see <http://www.gnu.org/licenses/>.
21 **
22 *************************************************************************/
23 
24 #pragma once
25 #ifndef IMPORTEPUB_H
26 #define IMPORTEPUB_H
27 
28 #include <QCoreApplication>
29 #include <QtCore/QHash>
30 #include <QtCore/QSet>
31 #include <QtCore/QStringList>
32 
33 #include "Importers/Importer.h"
34 #include "Misc/TempFolder.h"
35 
36 class HTMLResource;
37 class CSSResource;
38 class QXmlStreamReader;
39 
40 class ImportEPUB : public Importer
41 {
42     Q_DECLARE_TR_FUNCTIONS(ImportEPUB)
43 
44 public:
45     // The parameter is the file to be imported
46     ImportEPUB(const QString &fullfilepath);
47 
48     // Reads and parses the file
49     // and returns the created Book
50     virtual QSharedPointer<Book> GetBook(bool extract_metaata=true);
51 
52 private:
53     /**
54      * Extracts the EPUB file to a temporary folder.
55      * The path to the the temp folder with the extracted files
56      * is stored in m_ExtractedFolderPath.
57      */
58     void ExtractContainer();
59 
60     /**
61      * Locates the OPF file in the extracted folder.
62      * The path to the OPF is then stored in m_OPFFilePath.
63      */
64     void LocateOPF();
65 
66     /**
67      * Parses the OPF file and stores the parsed information
68      * inside m_MetaElements, m_Files and m_ReadingOrderIds
69      */
70     void ReadOPF();
71 
72     /**
73      * Reads an <identifier> element.
74      *
75      * @param opf_reader The OPF reader positioned to read
76      *                   the required element type.
77      */
78     void ReadIdentifierElement(QXmlStreamReader *opf_reader);
79 
80     /**
81      * Reads a metadata <link> element.
82      *
83      * @param opf_reader The OPF reader positioned to read
84      *                   the required element type.
85      */
86     void ReadMetadataLinkElement(QXmlStreamReader *opf_reader);
87 
88     /**
89      * Reads a manifest <item> element.
90      *
91      * @param opf_reader The OPF reader positioned to read
92      *                   the required element type.
93      */
94     void ReadManifestItemElement(QXmlStreamReader *opf_reader);
95 
96     /**
97      * Locate or create an NCX file if missing or not correctly specified.
98      *
99      * @param ncx_id_on_spine - The toc attribute value from the <spine>
100      */
101     void LocateOrCreateNCX(const QString &ncx_id_on_spine);
102 
103     /**
104      * Loads the book's infrastructure files, like
105      * the NCX and the OPF.
106      */
107     void LoadInfrastructureFiles();
108 
109     /**
110      * Loads the referenced files into the main folder of the book.
111      *
112      * @return success
113      */
114     bool LoadFolderStructure();
115 
116     /**
117      * Loads a single file.
118      *
119      * @param path A full path to the file to load.
120      * @param mimetype The mimetype of the file to load.
121      * @return A tuple where the first member is the old path to the file,
122      *         and the new member is the new, OEBPS-relative path to it.
123      */
124     std::tuple<QString, QString> LoadOneFile(const QString &path,
125                                         const QString &mimetype = QString());
126 
127     /**
128      * Performs the necessary modifications to the OPF
129      * source so that it can be read.
130      *
131      * @param source The XML source of the OPF.
132      * @return The prepared source.
133      */
134     QString PrepareOPFForReading(const QString &source);
135 
136     /**
137      * Parses the "encryption.xml" file in the META-INF folder.
138      * We return the list of file paths and the algorithms used
139      * to encrypt them.
140      *
141      * @return The list of encrypted fsiles. The keys are the
142      *         absolute paths to the files and the values are the
143      *         encryption algorithm IDs.
144      */
145     QHash<QString, QString> ParseEncryptionXml();
146 
147     bool BookContentEncrypted(const QHash<QString, QString> &encrypted_files);
148 
149     void AddObfuscatedButUndeclaredFonts(const QHash<QString, QString> &encrypted_files);
150 
151     /**
152      * Another workaround function to handle com.apple.ibooks.display-options.xml
153      * and any future non-standard Apple xml. If additional files need to
154      * be excluded from being handled as proper ePub content, you will also
155      * need to alter FILE_EXCEPTIONS at the top of FolderKeeper.cpp so the regex
156      * will detect them.
157      * This is not added to the manifest, but epubcheck uses a similar exception
158      * and accepts ePubs containing an unmanifested file of this name.
159      */
160     void AddNonStandardAppleXML();
161 
162     void ProcessFontFiles(const QList<Resource *> &resources,
163                           const QHash<QString, QString> &encrypted_files);
164 
165     /**
166      * The main temp folder where files are stored.
167      */
168     TempFolder m_TempFolder;
169 
170     /**
171      * The full path to the folder where the
172      * EPUB was extracted to.
173      */
174     QString m_ExtractedFolderPath;
175 
176     /**
177      * The full path to the OPF file
178      * of the publication.
179      */
180     QString m_OPFFilePath;
181 
182     /**
183      * The full path to the NCX file
184      * of the publication.
185      */
186     QString m_NCXFilePath;
187 
188     /**
189      * The map of all the files in the publication's
190      * manifest; The keys are the element ID's,
191      * the values are stored paths to the files.
192      */
193     QMap<QString, QString> m_Files;
194 
195     /**
196      * The map of all files in the publication's manifest;
197      * The keys are the element ID's, the vaules are the
198      * mimetype of the file.
199      */
200     QMap<QString, QString> m_FileMimetypes;
201 
202     /**
203      * InDesign likes listing several files multiple times in the manifest,
204      * even though that's explicitly forbidden by the spec. So we use this
205      * to make sure we don't load such files multiple times.
206      */
207     QStringList m_ManifestFilePaths;
208     QStringList m_ManifestMediaTypes;
209 
210     QSet<QString> m_ZipFilePaths;
211 
212     QDir m_opfDir;
213 
214     /**
215      * The identifier of the book's unique identifier.
216      */
217     QString m_UniqueIdentifierId;
218 
219     /**
220      * The value of the book's unique identifier.
221      */
222     QString m_UniqueIdentifierValue;
223 
224     /**
225      * The value of the book's first UUID-based identifier.
226      */
227     QString m_UuidIdentifierValue;
228 
229     /**
230      * It's theoretically possible (although unlikely) that an epub
231      * will have more than one file listed in the OPF manifest with
232      * an NCX mimetype. Only one of them will be the actual NCX though.
233      * This hash stores all the candidates, as an ID-to-href mapping.
234      */
235     QHash<QString, QString> m_NcxCandidates;
236 
237     bool m_HasSpineItems;
238     bool m_NCXNotInManifest;
239     QString m_NCXId;
240     QString m_NavId;
241     QString m_NavHref;
242     Resource * m_NavResource;
243 
244     /**
245      * The value of the opf package version tag
246      */
247     QString m_PackageVersion;
248 
249 };
250 
251 #endif // IMPORTEPUB_H
252