1 #include "../include/epubfmt.h"
2 #include "../include/crlog.h"
3
4
5 class EpubItem {
6 public:
7 lString32 href;
8 lString32 mediaType;
9 lString32 id;
10 lString32 title;
11 bool nonlinear;
EpubItem()12 EpubItem()
13 { }
EpubItem(const EpubItem & v)14 EpubItem( const EpubItem & v )
15 : href(v.href), mediaType(v.mediaType), id(v.id)
16 { }
operator =(const EpubItem & v)17 EpubItem & operator = ( const EpubItem & v )
18 {
19 href = v.href;
20 mediaType = v.mediaType;
21 id = v.id;
22 return *this;
23 }
24 };
25
26 class EpubItems : public LVPtrVector<EpubItem> {
27 public:
findById(const lString32 & id)28 EpubItem * findById( const lString32 & id )
29 {
30 if ( id.empty() )
31 return NULL;
32 for ( int i=0; i<length(); i++ )
33 if ( get(i)->id == id )
34 return get(i);
35 return NULL;
36 }
37 };
38
39 //static void dumpZip( LVContainerRef arc ) {
40 // lString32 arcName = LVExtractFilenameWithoutExtension( arc->GetName() );
41 // if ( arcName.empty() )
42 // arcName = "unziparc";
43 // lString32 outDir = cs32("/tmp/") + arcName;
44 // LVCreateDirectory(outDir);
45 // for ( int i=0; i<arc->GetObjectCount(); i++ ) {
46 // const LVContainerItemInfo * info = arc->GetObjectInfo(i);
47 // if ( !info->IsContainer() ) {
48 // lString32 outFileName = outDir + "/" + info->GetName();
49 // LVCreateDirectory(LVExtractPath(outFileName));
50 // LVStreamRef in = arc->OpenStream(info->GetName(), LVOM_READ);
51 // LVStreamRef out = LVOpenFileStream(outFileName.c_str(), LVOM_WRITE);
52 // if ( !in.isNull() && !out.isNull() ) {
53 // CRLog::trace("Writing %s", LCSTR(outFileName));
54 // LVPumpStream(out.get(), in.get());
55 // }
56 // }
57 // }
58 //}
59
DetectEpubFormat(LVStreamRef stream)60 bool DetectEpubFormat( LVStreamRef stream )
61 {
62
63
64 LVContainerRef m_arc = LVOpenArchieve( stream );
65 if ( m_arc.isNull() )
66 return false; // not a ZIP archive
67
68 //dumpZip( m_arc );
69
70 // read "mimetype" file contents from root of archive
71 lString32 mimeType;
72 {
73 LVStreamRef mtStream = m_arc->OpenStream(U"mimetype", LVOM_READ );
74 if ( !mtStream.isNull() ) {
75 lvsize_t size = mtStream->GetSize();
76 if ( size>4 && size<100 ) {
77 LVArray<char> buf( size+1, '\0' );
78 if ( mtStream->Read( buf.get(), size, NULL )==LVERR_OK ) {
79 for ( lvsize_t i=0; i<size; i++ )
80 if ( buf[i]<32 || ((unsigned char)buf[i])>127 )
81 buf[i] = 0;
82 buf[size] = 0;
83 if ( buf[0] )
84 mimeType = Utf8ToUnicode( lString8( buf.get() ) );
85 }
86 }
87 }
88 }
89
90 if ( mimeType != U"application/epub+zip" )
91 return false;
92 return true;
93 }
94
ReadEpubNcxToc(ldomDocument * doc,ldomNode * mapRoot,LVTocItem * baseToc,ldomDocumentFragmentWriter & appender)95 void ReadEpubNcxToc( ldomDocument * doc, ldomNode * mapRoot, LVTocItem * baseToc, ldomDocumentFragmentWriter & appender ) {
96 if ( !mapRoot || !baseToc)
97 return;
98 lUInt16 navPoint_id = mapRoot->getDocument()->getElementNameIndex(U"navPoint");
99 lUInt16 navLabel_id = mapRoot->getDocument()->getElementNameIndex(U"navLabel");
100 lUInt16 content_id = mapRoot->getDocument()->getElementNameIndex(U"content");
101 lUInt16 text_id = mapRoot->getDocument()->getElementNameIndex(U"text");
102 for ( int i=0; i<EPUB_TOC_MAX_ITER; i++ ) {
103 ldomNode * navPoint = mapRoot->findChildElement(LXML_NS_ANY, navPoint_id, i);
104 if ( !navPoint )
105 break;
106 ldomNode * navLabel = navPoint->findChildElement(LXML_NS_ANY, navLabel_id, -1);
107 if ( !navLabel )
108 continue;
109 ldomNode * text = navLabel->findChildElement(LXML_NS_ANY, text_id, -1);
110 if ( !text )
111 continue;
112 ldomNode * content = navPoint->findChildElement(LXML_NS_ANY, content_id, -1);
113 if ( !content )
114 continue;
115 lString32 href = content->getAttributeValue("src");
116 lString32 title = text->getText(' ');
117 title.trimDoubleSpaces(false, false, false);
118 if ( href.empty() || title.empty() )
119 continue;
120 //CRLog::trace("TOC href before convert: %s", LCSTR(href));
121 href = DecodeHTMLUrlString(href);
122 href = appender.convertHref(href);
123 //CRLog::trace("TOC href after convert: %s", LCSTR(href));
124 if ( href.empty() || href[0]!='#' )
125 continue;
126 ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
127 if ( !target )
128 continue;
129 ldomXPointer ptr(target, 0);
130 LVTocItem * tocItem = baseToc->addChild(title, ptr, lString32::empty_str);
131 ReadEpubNcxToc( doc, navPoint, tocItem, appender );
132 }
133 }
134
ReadEpubNcxPageList(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)135 void ReadEpubNcxPageList( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
136 // http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.4.1.2
137 // http://idpf.org/epub/a11y/techniques/techniques-20160711.html#refPackagesLatest
138 // <pageTarget id="p4" playOrder="6" type="normal" value="2">
139 // <navLabel><text>Page 8</text></navLabel>
140 // <content src="OEBPS/PL12.xhtml#page_8"/>
141 // </pageTarget>
142 // http://blog.epubbooks.com/346/marking-up-page-numbers-in-the-epub-ncx/
143 // type:value must be unique, and value can not be used as a short version of text...
144 // Also see http://kb.daisy.org/publishing/docs/navigation/pagelist.html
145 if ( !mapRoot || !pageMap)
146 return;
147 lUInt16 pageTarget_id = mapRoot->getDocument()->getElementNameIndex(U"pageTarget");
148 lUInt16 navLabel_id = mapRoot->getDocument()->getElementNameIndex(U"navLabel");
149 lUInt16 content_id = mapRoot->getDocument()->getElementNameIndex(U"content");
150 lUInt16 text_id = mapRoot->getDocument()->getElementNameIndex(U"text");
151 for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
152 ldomNode * pageTarget = mapRoot->findChildElement(LXML_NS_ANY, pageTarget_id, i);
153 if ( !pageTarget )
154 break;
155 ldomNode * navLabel = pageTarget->findChildElement(LXML_NS_ANY, navLabel_id, -1);
156 if ( !navLabel )
157 continue;
158 ldomNode * text = navLabel->findChildElement(LXML_NS_ANY, text_id, -1);
159 if ( !text )
160 continue;
161 ldomNode * content = pageTarget->findChildElement(LXML_NS_ANY, content_id, -1);
162 if ( !content )
163 continue;
164 lString32 href = content->getAttributeValue("src");
165 lString32 title = text->getText(' ');
166 title.trimDoubleSpaces(false, false, false);
167 if ( href.empty() || title.empty() )
168 continue;
169 href = DecodeHTMLUrlString(href);
170 href = appender.convertHref(href);
171 if ( href.empty() || href[0]!='#' )
172 continue;
173 ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
174 if ( !target )
175 continue;
176 ldomXPointer ptr(target, 0);
177 pageMap->addPage(title, ptr, lString32::empty_str);
178 }
179 }
180
ReadEpubNavToc(ldomDocument * doc,ldomNode * mapRoot,LVTocItem * baseToc,ldomDocumentFragmentWriter & appender)181 void ReadEpubNavToc( ldomDocument * doc, ldomNode * mapRoot, LVTocItem * baseToc, ldomDocumentFragmentWriter & appender ) {
182 // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
183 if ( !mapRoot || !baseToc)
184 return;
185 lUInt16 ol_id = mapRoot->getDocument()->getElementNameIndex(U"ol");
186 lUInt16 li_id = mapRoot->getDocument()->getElementNameIndex(U"li");
187 lUInt16 a_id = mapRoot->getDocument()->getElementNameIndex(U"a");
188 lUInt16 span_id = mapRoot->getDocument()->getElementNameIndex(U"span");
189 for ( int i=0; i<EPUB_TOC_MAX_ITER; i++ ) {
190 ldomNode * li = mapRoot->findChildElement(LXML_NS_ANY, li_id, i);
191 if ( !li )
192 break;
193 LVTocItem * tocItem = NULL;
194 ldomNode * a = li->findChildElement(LXML_NS_ANY, a_id, -1);
195 if ( a ) {
196 lString32 href = a->getAttributeValue("href");
197 lString32 title = a->getText(' ');
198 if ( title.empty() ) {
199 // "If the a element contains [...] that do not provide intrinsic text alternatives,
200 // it must also include a title attribute with an alternate text rendition of the
201 // link label."
202 title = a->getAttributeValue("title");
203 }
204 title.trimDoubleSpaces(false, false, false);
205 if ( !href.empty() ) {
206 href = DecodeHTMLUrlString(href);
207 href = appender.convertHref(href);
208 if ( !href.empty() && href[0]=='#' ) {
209 ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
210 if ( target ) {
211 ldomXPointer ptr(target, 0);
212 tocItem = baseToc->addChild(title, ptr, lString32::empty_str);
213 // Report xpointer to upper parent(s) that didn't have
214 // one (no <a>) - but stop before the root node
215 LVTocItem * tmp = baseToc;
216 while ( tmp && tmp->getLevel() > 0 && tmp->getXPointer().isNull() ) {
217 tmp->setXPointer(ptr);
218 tmp = tmp->getParent();
219 }
220 }
221 }
222 }
223 }
224 // "The a element may optionally be followed by an ol ordered list representing
225 // a subsidiary content level below that heading (e.g., all the subsection
226 // headings of a section). The span element must be followed by an ol ordered
227 // list: it cannot be used in "leaf" li elements."
228 ldomNode * ol = li->findChildElement( LXML_NS_ANY, ol_id, -1 );
229 if ( ol ) { // there are sub items
230 if ( !tocItem ) {
231 // Make a LVTocItem to contain sub items
232 // There can be a <span>, with no href: children will set it to its own xpointer
233 lString32 title;
234 ldomNode * span = li->findChildElement(LXML_NS_ANY, span_id, -1);
235 if ( span ) {
236 title = span->getText(' ');
237 title.trimDoubleSpaces(false, false, false);
238 }
239 // If none, let title empty
240 tocItem = baseToc->addChild(title, ldomXPointer(), lString32::empty_str);
241 }
242 ReadEpubNavToc( doc, ol, tocItem, appender );
243 }
244 }
245 }
246
ReadEpubNavPageMap(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)247 void ReadEpubNavPageMap( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
248 // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
249 if ( !mapRoot || !pageMap)
250 return;
251 lUInt16 li_id = mapRoot->getDocument()->getElementNameIndex(U"li");
252 lUInt16 a_id = mapRoot->getDocument()->getElementNameIndex(U"a");
253 for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
254 ldomNode * li = mapRoot->findChildElement(LXML_NS_ANY, li_id, i);
255 if ( !li )
256 break;
257 ldomNode * a = li->findChildElement(LXML_NS_ANY, a_id, -1);
258 if ( a ) {
259 lString32 href = a->getAttributeValue("href");
260 lString32 title = a->getText(' ');
261 if ( title.empty() ) {
262 title = a->getAttributeValue("title");
263 }
264 title.trimDoubleSpaces(false, false, false);
265 if ( !href.empty() ) {
266 href = DecodeHTMLUrlString(href);
267 href = appender.convertHref(href);
268 if ( !href.empty() && href[0]=='#' ) {
269 ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
270 if ( target ) {
271 ldomXPointer ptr(target, 0);
272 pageMap->addPage(title, ptr, lString32::empty_str);
273 }
274 }
275 }
276 }
277 }
278 }
279
ReadEpubAdobePageMap(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)280 void ReadEpubAdobePageMap( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
281 // https://wiki.mobileread.com/wiki/Adobe_Digital_Editions#Page-map
282 if ( !mapRoot || !pageMap)
283 return;
284 lUInt16 page_id = mapRoot->getDocument()->getElementNameIndex(U"page");
285 for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
286 ldomNode * page = mapRoot->findChildElement(LXML_NS_ANY, page_id, i);
287 if ( !page )
288 break;
289 lString32 href = page->getAttributeValue("href");
290 lString32 title = page->getAttributeValue("name");
291 title.trimDoubleSpaces(false, false, false);
292 if ( href.empty() || title.empty() )
293 continue;
294 href = DecodeHTMLUrlString(href);
295 href = appender.convertHref(href);
296 if ( href.empty() || href[0]!='#' )
297 continue;
298 ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
299 if ( !target )
300 continue;
301 ldomXPointer ptr(target, 0);
302 pageMap->addPage(title, ptr, lString32::empty_str);
303 }
304 }
305
EpubGetRootFilePath(LVContainerRef m_arc)306 lString32 EpubGetRootFilePath(LVContainerRef m_arc)
307 {
308 // check root media type
309 lString32 rootfilePath;
310 lString32 rootfileMediaType;
311 // read container.xml
312 {
313 LVStreamRef container_stream = m_arc->OpenStream(U"META-INF/container.xml", LVOM_READ);
314 if ( !container_stream.isNull() ) {
315 ldomDocument * doc = LVParseXMLStream( container_stream );
316 if ( doc ) {
317 ldomNode * rootfile = doc->nodeFromXPath( cs32("container/rootfiles/rootfile") );
318 if ( rootfile && rootfile->isElement() ) {
319 rootfilePath = rootfile->getAttributeValue("full-path");
320 rootfileMediaType = rootfile->getAttributeValue("media-type");
321 }
322 delete doc;
323 }
324 }
325 }
326
327 if (rootfilePath.empty() || rootfileMediaType != "application/oebps-package+xml")
328 return lString32::empty_str;
329 return rootfilePath;
330 }
331
332 /// encrypted font demangling proxy: XORs first 1024 bytes of source stream with key
333 class FontDemanglingStream : public StreamProxy {
334 LVArray<lUInt8> & _key;
335 public:
FontDemanglingStream(LVStreamRef baseStream,LVArray<lUInt8> & key)336 FontDemanglingStream(LVStreamRef baseStream, LVArray<lUInt8> & key) : StreamProxy(baseStream), _key(key) {
337 }
338
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)339 virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead ) {
340 lvpos_t pos = _base->GetPos();
341 lverror_t res = _base->Read(buf, count, nBytesRead);
342 if (pos < 1024 && _key.length() == 16) {
343 for (int i=0; i + pos < 1024; i++) {
344 int keyPos = (i + pos) & 15;
345 ((lUInt8*)buf)[i] ^= _key[keyPos];
346 }
347 }
348 return res;
349 }
350
351 };
352
353 class EncryptedItem {
354 public:
355 lString32 _uri;
356 lString32 _method;
EncryptedItem(lString32 uri,lString32 method)357 EncryptedItem(lString32 uri, lString32 method) : _uri(uri), _method(method) {
358
359 }
360 };
361
362 class EncryptedItemCallback {
363 public:
364 virtual void addEncryptedItem(EncryptedItem * item) = 0;
~EncryptedItemCallback()365 virtual ~EncryptedItemCallback() {}
366 };
367
368
369 class EncCallback : public LVXMLParserCallback {
370 bool insideEncryption;
371 bool insideEncryptedData;
372 bool insideEncryptionMethod;
373 bool insideCipherData;
374 bool insideCipherReference;
375 public:
376 /// called on opening tag <
OnTagOpen(const lChar32 * nsname,const lChar32 * tagname)377 virtual ldomNode * OnTagOpen( const lChar32 * nsname, const lChar32 * tagname) {
378 CR_UNUSED(nsname);
379 if (!lStr_cmp(tagname, "encryption"))
380 insideEncryption = true;
381 else if (!lStr_cmp(tagname, "EncryptedData"))
382 insideEncryptedData = true;
383 else if (!lStr_cmp(tagname, "EncryptionMethod"))
384 insideEncryptionMethod = true;
385 else if (!lStr_cmp(tagname, "CipherData"))
386 insideCipherData = true;
387 else if (!lStr_cmp(tagname, "CipherReference"))
388 insideCipherReference = true;
389 return NULL;
390 }
391 /// called on tag close
OnTagClose(const lChar32 * nsname,const lChar32 * tagname,bool self_closing_tag=false)392 virtual void OnTagClose( const lChar32 * nsname, const lChar32 * tagname, bool self_closing_tag=false ) {
393 CR_UNUSED(nsname);
394 if (!lStr_cmp(tagname, "encryption"))
395 insideEncryption = false;
396 else if (!lStr_cmp(tagname, "EncryptedData") && insideEncryptedData) {
397 if (!algorithm.empty() && !uri.empty()) {
398 _container->addEncryptedItem(new EncryptedItem(uri, algorithm));
399 }
400 insideEncryptedData = false;
401 } else if (!lStr_cmp(tagname, "EncryptionMethod"))
402 insideEncryptionMethod = false;
403 else if (!lStr_cmp(tagname, "CipherData"))
404 insideCipherData = false;
405 else if (!lStr_cmp(tagname, "CipherReference"))
406 insideCipherReference = false;
407 }
408 /// called on element attribute
OnAttribute(const lChar32 * nsname,const lChar32 * attrname,const lChar32 * attrvalue)409 virtual void OnAttribute( const lChar32 * nsname, const lChar32 * attrname, const lChar32 * attrvalue ) {
410 CR_UNUSED2(nsname, attrvalue);
411 if (!lStr_cmp(attrname, "URI") && insideCipherReference)
412 insideEncryption = false;
413 else if (!lStr_cmp(attrname, "Algorithm") && insideEncryptionMethod)
414 insideEncryptedData = false;
415 }
416 /// called on text
OnText(const lChar32 * text,int len,lUInt32 flags)417 virtual void OnText( const lChar32 * text, int len, lUInt32 flags ) {
418 CR_UNUSED3(text,len,flags);
419 }
420 /// add named BLOB data to document
OnBlob(lString32 name,const lUInt8 * data,int size)421 virtual bool OnBlob(lString32 name, const lUInt8 * data, int size) {
422 CR_UNUSED3(name,data,size);
423 return false;
424 }
425
OnStop()426 virtual void OnStop() { }
427 /// called after > of opening tag (when entering tag body)
OnTagBody()428 virtual void OnTagBody() { }
429
430 EncryptedItemCallback * _container;
431 lString32 algorithm;
432 lString32 uri;
433 /// destructor
EncCallback(EncryptedItemCallback * container)434 EncCallback(EncryptedItemCallback * container) : _container(container) {
435 insideEncryption = false;
436 insideEncryptedData = false;
437 insideEncryptionMethod = false;
438 insideCipherData = false;
439 insideCipherReference = false;
440 }
~EncCallback()441 virtual ~EncCallback() {}
442 };
443
444 class EncryptedDataContainer : public LVContainer, public EncryptedItemCallback {
445 LVContainerRef _container;
446 LVPtrVector<EncryptedItem> _list;
447 public:
EncryptedDataContainer(LVContainerRef baseContainer)448 EncryptedDataContainer(LVContainerRef baseContainer) : _container(baseContainer) {
449
450 }
451
GetParentContainer()452 virtual LVContainer * GetParentContainer() { return _container->GetParentContainer(); }
453 //virtual const LVContainerItemInfo * GetObjectInfo(const lChar32 * pname);
GetObjectInfo(int index)454 virtual const LVContainerItemInfo * GetObjectInfo(int index) { return _container->GetObjectInfo(index); }
GetObjectCount() const455 virtual int GetObjectCount() const { return _container->GetObjectCount(); }
456 /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)457 virtual lverror_t GetSize( lvsize_t * pSize ) { return _container->GetSize(pSize); }
458
459
OpenStream(const lChar32 * fname,lvopen_mode_t mode)460 virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode ) {
461
462 LVStreamRef res = _container->OpenStream(fname, mode);
463 if (res.isNull())
464 return res;
465 if (isEncryptedItem(fname))
466 return LVStreamRef(new FontDemanglingStream(res, _fontManglingKey));
467 return res;
468 }
469
470 /// returns stream/container name, may be NULL if unknown
GetName()471 virtual const lChar32 * GetName()
472 {
473 return _container->GetName();
474 }
475 /// sets stream/container name, may be not implemented for some objects
SetName(const lChar32 * name)476 virtual void SetName(const lChar32 * name)
477 {
478 _container->SetName(name);
479 }
480
481
addEncryptedItem(EncryptedItem * item)482 virtual void addEncryptedItem(EncryptedItem * item) {
483 _list.add(item);
484 }
485
findEncryptedItem(const lChar32 * name)486 EncryptedItem * findEncryptedItem(const lChar32 * name) {
487 lString32 n;
488 if (name[0] != '/' && name[0] != '\\')
489 n << "/";
490 n << name;
491 for (int i=0; i<_list.length(); i++) {
492 lString32 s = _list[i]->_uri;
493 if (s[0]!='/' && s[i]!='\\')
494 s = "/" + s;
495 if (_list[i]->_uri == s)
496 return _list[i];
497 }
498 return NULL;
499 }
500
isEncryptedItem(const lChar32 * name)501 bool isEncryptedItem(const lChar32 * name) {
502 return findEncryptedItem(name) != NULL;
503 }
504
505 LVArray<lUInt8> _fontManglingKey;
506
setManglingKey(lString32 key)507 bool setManglingKey(lString32 key) {
508 if (key.startsWith("urn:uuid:"))
509 key = key.substr(9);
510 _fontManglingKey.clear();
511 _fontManglingKey.reserve(16);
512 lUInt8 b = 0;
513 int n = 0;
514 for (int i=0; i<key.length(); i++) {
515 int d = hexDigit(key[i]);
516 if (d>=0) {
517 b = (b << 4) | d;
518 if (++n > 1) {
519 _fontManglingKey.add(b);
520 n = 0;
521 b = 0;
522 }
523 }
524 }
525 return _fontManglingKey.length() == 16;
526 }
527
hasUnsupportedEncryption()528 bool hasUnsupportedEncryption() {
529 for (int i=0; i<_list.length(); i++) {
530 lString32 method = _list[i]->_method;
531 if (method != "http://ns.adobe.com/pdf/enc#RC") {
532 CRLog::debug("unsupported encryption method: %s", LCSTR(method));
533 return true;
534 }
535 }
536 return false;
537 }
538
open()539 bool open() {
540 LVStreamRef stream = _container->OpenStream(U"META-INF/encryption.xml", LVOM_READ);
541 if (stream.isNull())
542 return false;
543 EncCallback enccallback(this);
544 LVXMLParser parser(stream, &enccallback, false, false);
545 if (!parser.Parse())
546 return false;
547 if (_list.length())
548 return true;
549 return false;
550 }
551 };
552
createEncryptedEpubWarningDocument(ldomDocument * m_doc)553 void createEncryptedEpubWarningDocument(ldomDocument * m_doc) {
554 CRLog::error("EPUB document contains encrypted items");
555 ldomDocumentWriter writer(m_doc);
556 writer.OnTagOpenNoAttr(NULL, U"body");
557 writer.OnTagOpenNoAttr(NULL, U"h3");
558 lString32 hdr("Encrypted content");
559 writer.OnText(hdr.c_str(), hdr.length(), 0);
560 writer.OnTagClose(NULL, U"h3");
561
562 writer.OnTagOpenAndClose(NULL, U"hr");
563
564 writer.OnTagOpenNoAttr(NULL, U"p");
565 lString32 txt("This document is encrypted (has DRM protection).");
566 writer.OnText(txt.c_str(), txt.length(), 0);
567 writer.OnTagClose(NULL, U"p");
568
569 writer.OnTagOpenNoAttr(NULL, U"p");
570 lString32 txt2("Cool Reader doesn't support reading of DRM protected books.");
571 writer.OnText(txt2.c_str(), txt2.length(), 0);
572 writer.OnTagClose(NULL, U"p");
573
574 writer.OnTagOpenNoAttr(NULL, U"p");
575 lString32 txt3("To read this book, please use software recommended by book seller.");
576 writer.OnText(txt3.c_str(), txt3.length(), 0);
577 writer.OnTagClose(NULL, U"p");
578
579 writer.OnTagOpenAndClose(NULL, U"hr");
580
581 writer.OnTagOpenNoAttr(NULL, U"p");
582 lString32 txt4("");
583 writer.OnText(txt4.c_str(), txt4.length(), 0);
584 writer.OnTagClose(NULL, U"p");
585
586 writer.OnTagClose(NULL, U"body");
587 }
588
GetEpubCoverpage(LVContainerRef arc)589 LVStreamRef GetEpubCoverpage(LVContainerRef arc)
590 {
591 // check root media type
592 lString32 rootfilePath = EpubGetRootFilePath(arc);
593 if ( rootfilePath.empty() )
594 return LVStreamRef();
595
596 EncryptedDataContainer * decryptor = new EncryptedDataContainer(arc);
597 if (decryptor->open()) {
598 CRLog::debug("EPUB: encrypted items detected");
599 }
600
601 LVContainerRef m_arc = LVContainerRef(decryptor);
602
603 lString32 codeBase = LVExtractPath(rootfilePath, false);
604 CRLog::trace("codeBase=%s", LCSTR(codeBase));
605
606 LVStreamRef content_stream = m_arc->OpenStream(rootfilePath.c_str(), LVOM_READ);
607 if ( content_stream.isNull() )
608 return LVStreamRef();
609
610
611 LVStreamRef coverPageImageStream;
612 // reading content stream
613 {
614 lString32 coverId;
615 ldomDocument * doc = LVParseXMLStream( content_stream );
616 if ( !doc )
617 return LVStreamRef();
618
619 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
620 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
621 if ( !item )
622 break;
623 lString32 name = item->getAttributeValue("name");
624 if (name == "cover") {
625 lString32 content = item->getAttributeValue("content");
626 coverId = content;
627 // We're done
628 break;
629 }
630 }
631
632 // items
633 for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
634 ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
635 if ( !item )
636 break;
637 lString32 href = item->getAttributeValue("href");
638 lString32 id = item->getAttributeValue("id");
639 if ( !href.empty() && !id.empty() ) {
640 if (id == coverId) {
641 // coverpage file
642 href = DecodeHTMLUrlString(href);
643 lString32 coverFileName = LVCombinePaths(codeBase, href);
644 CRLog::info("EPUB coverpage file: %s", LCSTR(coverFileName));
645 coverPageImageStream = m_arc->OpenStream(coverFileName.c_str(), LVOM_READ);
646 // We're done
647 break;
648 }
649 }
650 }
651 delete doc;
652 }
653
654 return coverPageImageStream;
655 }
656
657
658 class EmbeddedFontStyleParser {
659 LVEmbeddedFontList & _fontList;
660 lString32 _basePath;
661 int _state;
662 lString8 _face;
663 lString8 islocal;
664 bool _italic;
665 bool _bold;
666 lString32 _url;
667 public:
EmbeddedFontStyleParser(LVEmbeddedFontList & fontList)668 EmbeddedFontStyleParser(LVEmbeddedFontList & fontList) : _fontList(fontList) { }
onToken(char token)669 void onToken(char token) {
670 // 4,5: font-family:
671 // 6,7: font-weight:
672 // 8,9: font-style:
673 //10,11: src:
674 // 10 11 12 13
675 // src : url (
676 //CRLog::trace("state==%d: %c ", _state, token);
677 switch (token) {
678 case ':':
679 if (_state < 2) {
680 _state = 0;
681 } else if (_state == 4 || _state == 6 || _state == 8 || _state == 10) {
682 _state++;
683 } else if (_state != 3) {
684 _state = 2;
685 }
686 break;
687 case ';':
688 if (_state < 2) {
689 _state = 0;
690 } else if (_state != 3) {
691 _state = 2;
692 }
693 break;
694 case '{':
695 if (_state == 1) {
696 _state = 2; // inside @font {
697 _face.clear();
698 _italic = false;
699 _bold = false;
700 _url.clear();
701 } else
702 _state = 3; // inside other {
703 break;
704 case '}':
705 if (_state == 2) {
706 if (!_url.empty()) {
707 // CRLog::trace("@font { face: %s; bold: %s; italic: %s; url: %s", _face.c_str(), _bold ? "yes" : "no",
708 // _italic ? "yes" : "no", LCSTR(_url));
709 if (islocal.length()==5 && _basePath.length()!=0)
710 _url = _url.substr((_basePath.length()+1), (_url.length()-_basePath.length()));
711 if (_fontList.findByUrl(_url))
712 _url=_url.append(lString32(" ")); //avoid add() replaces existing local name
713 _fontList.add(_url, _face, _bold, _italic);
714 }
715 }
716 _state = 0;
717 break;
718 case ',':
719 if (_state == 2) {
720 if (!_url.empty()) {
721 if (islocal.length() == 5 && _basePath.length()!=0) _url=(_url.substr((_basePath.length()+1),(_url.length()-_basePath.length())));
722 if (_fontList.findByUrl(_url)) _url=_url.append(lString32(" "));
723 _fontList.add(_url, _face, _bold, _italic);
724 }
725 _state = 11;
726 }
727 break;
728 case '(':
729 if (_state == 12) {
730 _state = 13;
731 } else {
732 if (_state > 3)
733 _state = 2;
734 }
735 break;
736 }
737 }
onToken(lString8 & token)738 void onToken(lString8 & token) {
739 if (token.empty())
740 return;
741 lString8 t = token;
742 token.clear();
743 //CRLog::trace("state==%d: %s", _state, t.c_str());
744 if (t == "@font-face") {
745 if (_state == 0)
746 _state = 1; // right after @font
747 return;
748 }
749 if (_state == 1)
750 _state = 0;
751 if (_state == 2) {
752 if (t == "font-family")
753 _state = 4;
754 else if (t == "font-weight")
755 _state = 6;
756 else if (t == "font-style")
757 _state = 8;
758 else if (t == "src")
759 _state = 10;
760 } else if (_state == 5) {
761 _face = t;
762 _state = 2;
763 } else if (_state == 7) {
764 if (t == "bold")
765 _bold = true;
766 _state = 2;
767 } else if (_state == 9) {
768 if (t == "italic")
769 _italic = true;
770 _state = 2;
771 } else if (_state == 11) {
772 if (t == "url") {
773 _state = 12;
774 islocal=t;
775 }
776 else if (t=="local") {
777 _state=12;
778 islocal=t;
779 }
780 else
781 _state = 2;
782 }
783 }
onQuotedText(lString8 & token)784 void onQuotedText(lString8 & token) {
785 //CRLog::trace("state==%d: \"%s\"", _state, token.c_str());
786 if (_state == 11 || _state == 13) {
787 if (!token.empty()) {
788 lString32 ltoken = Utf8ToUnicode(token);
789 if (ltoken.startsWithNoCase(lString32("res://")) || ltoken.startsWithNoCase(lString32("file://")) )
790 _url = ltoken;
791 else
792 _url = LVCombinePaths(_basePath, ltoken);
793 }
794 _state = 2;
795 } else if (_state == 5) {
796 if (!token.empty()) {
797 _face = token;
798 }
799 _state = 2;
800 }
801 token.clear();
802 }
deletecomment(lString8 css)803 lString8 deletecomment(lString8 css) {
804 int state;
805 lString8 tmp=lString8("");
806 tmp.reserve( css.length() );
807 char c;
808 state = 0;
809 for (int i=0;i<css.length();i++) {
810 c=css[i];
811 if (state == 0 ) {
812 if (c == ('/')) // ex. [/]
813 state = 1;
814 else if (c == ('\'') ) // ex. [']
815 state = 5;
816 else if (c == ('\"')) // ex. ["]
817 state = 7;
818 }
819 else if (state == 1 && c == ('*')) // ex. [/*]
820 state = 2;
821 else if (state == 1) { // ex. [<secure/_stdio.h> or 5/3]
822 tmp<<('/');
823 state = 0;
824 }
825 else if (state == 2 && c == ('*')) // ex. [/*he*]
826 state = 3;
827 else if (state == 2) // ex. [/*heh]
828 state = 2;
829 else if (state == 3 && c == ('/')) // ex. [/*heh*/]
830 state = 0;
831 else if (state == 3) // ex. [/*heh*e]
832 state = 2;
833 /* Moved up for faster normal path:
834 else if (state == 0 && c == ('\'') ) // ex. [']
835 state = 5;
836 */
837 else if (state == 5 && c == ('\\')) // ex. ['\]
838 state = 6;
839 else if (state == 6) // ex. ['\n or '\' or '\t etc.]
840 state = 5;
841 else if (state == 5 && c == ('\'') ) // ex. ['\n' or '\'' or '\t' ect.]
842 state = 0;
843 /* Moved up for faster normal path:
844 else if (state == 0 && c == ('\"')) // ex. ["]
845 state = 7;
846 */
847 else if (state == 8) // ex. ["\n or "\" or "\t ect.]
848 state = 7;
849 else if (state == 7 && c == ('\"')) // ex. ["\n" or "\"" or "\t" ect.]
850 state = 0;
851 if ((state == 0 && c != ('/')) || state == 5 || state == 6 || state == 7 || state == 8)
852 tmp<<c;
853 }
854 return tmp;
855 }
parse(lString32 basePath,const lString8 & css)856 void parse(lString32 basePath, const lString8 & css) {
857 _state = 0;
858 _basePath = basePath;
859 lString8 token;
860 char insideQuotes = 0;
861 lString8 css_ = deletecomment(css);
862 for (int i=0; i<css_.length(); i++) {
863 char ch = css_[i];
864 if (insideQuotes || _state == 13) {
865 if (ch == insideQuotes || (_state == 13 && ch == ')')) {
866 onQuotedText(token);
867 insideQuotes = 0;
868 if (_state == 13)
869 onToken(ch);
870 } else {
871 if (_state == 13 && token.empty() && (ch == '\'' || ch=='\"')) {
872 insideQuotes = ch;
873 } else if (ch != ' ' || _state != 13)
874 token << ch;
875 }
876 continue;
877 }
878 if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
879 onToken(token);
880 } else if (ch == '@' || ch=='-' || ch=='_' || ch=='.' || (ch>='a' && ch <='z') || (ch>='A' && ch <='Z') || (ch>='0' && ch <='9')) {
881 token << ch;
882 } else if (ch == ':' || ch=='{' || ch == '}' || ch=='(' || ch == ')' || ch == ';' || ch == ',') {
883 onToken(token);
884 onToken(ch);
885 } else if (ch == '\'' || ch == '\"') {
886 onToken(token);
887 insideQuotes = ch;
888 }
889 }
890 }
891 };
892
ImportEpubDocument(LVStreamRef stream,ldomDocument * m_doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback,bool metadataOnly)893 bool ImportEpubDocument( LVStreamRef stream, ldomDocument * m_doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback, bool metadataOnly )
894 {
895 LVContainerRef arc = LVOpenArchieve( stream );
896 if ( arc.isNull() )
897 return false; // not a ZIP archive
898
899 // check root media type
900 lString32 rootfilePath = EpubGetRootFilePath(arc);
901 if ( rootfilePath.empty() )
902 return false;
903
904 EncryptedDataContainer * decryptor = new EncryptedDataContainer(arc);
905 if (decryptor->open()) {
906 CRLog::debug("EPUB: encrypted items detected");
907 }
908
909 LVContainerRef m_arc = LVContainerRef(decryptor);
910
911 if (decryptor->hasUnsupportedEncryption()) {
912 // DRM!!!
913 createEncryptedEpubWarningDocument(m_doc);
914 return true;
915 }
916
917 m_doc->setContainer(m_arc);
918
919 if ( progressCallback )
920 progressCallback->OnLoadFileProgress(1);
921
922 // read content.opf
923 EpubItems epubItems;
924 //EpubItem * epubToc = NULL; //TODO
925 LVArray<EpubItem*> spineItems;
926 lString32 codeBase;
927 //lString32 css;
928
929 //
930 {
931 codeBase=LVExtractPath(rootfilePath, false);
932 CRLog::trace("codeBase=%s", LCSTR(codeBase));
933 }
934
935 LVStreamRef content_stream = m_arc->OpenStream(rootfilePath.c_str(), LVOM_READ);
936 if ( content_stream.isNull() )
937 return false;
938
939
940 bool isEpub3 = false;
941 lString32 epubVersion;
942 lString32 navHref; // epub3 TOC
943 lString32 ncxHref; // epub2 TOC
944 lString32 pageMapHref; // epub2 Adobe page-map
945 lString32 pageMapSource;
946 lString32 coverId;
947
948 LVEmbeddedFontList fontList;
949 EmbeddedFontStyleParser styleParser(fontList);
950
951 // reading content stream
952 {
953 CRLog::debug("Parsing opf");
954 ldomDocument * doc = LVParseXMLStream( content_stream );
955 if ( !doc )
956 return false;
957
958 // // for debug
959 // {
960 // LVStreamRef out = LVOpenFileStream("/tmp/content.xml", LVOM_WRITE);
961 // doc->saveToStream(out, NULL, true);
962 // }
963
964 ldomNode * package = doc->nodeFromXPath(lString32("package"));
965 if ( package ) {
966 epubVersion = package->getAttributeValue("version");
967 if ( !epubVersion.empty() && epubVersion[0] >= '3' )
968 isEpub3 = true;
969 }
970
971 CRPropRef m_doc_props = m_doc->getProps();
972 // lString32 authors = doc->textFromXPath( cs32("package/metadata/creator"));
973 lString32 title = doc->textFromXPath( cs32("package/metadata/title"));
974 lString32 language = doc->textFromXPath( cs32("package/metadata/language"));
975 lString32 description = doc->textFromXPath( cs32("package/metadata/description"));
976 pageMapSource = doc->textFromXPath( cs32("package/metadata/source"));
977 // m_doc_props->setString(DOC_PROP_AUTHORS, authors);
978 m_doc_props->setString(DOC_PROP_TITLE, title);
979 m_doc_props->setString(DOC_PROP_LANGUAGE, language);
980 m_doc_props->setString(DOC_PROP_DESCRIPTION, description);
981 m_doc_props->setHex(DOC_PROP_FILE_CRC32, stream->getcrc32());
982
983 // Return possibly multiple <dc:creator> (authors) and <dc:subject> (keywords)
984 // as a single doc_props string with values separated by \n.
985 // (these \n can be replaced on the lua side for the most appropriate display)
986 bool authors_set = false;
987 lString32 authors;
988 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
989 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/creator[") << fmt::decimal(i) << "]");
990 if (!item)
991 break;
992 lString32 author = item->getText().trim();
993 if (authors_set) {
994 authors << "\n" << author;
995 }
996 else {
997 authors << author;
998 authors_set = true;
999 }
1000 }
1001 m_doc_props->setString(DOC_PROP_AUTHORS, authors);
1002
1003 // There may be multiple <dc:subject> tags, which are usually used for keywords, categories
1004 bool subjects_set = false;
1005 lString32 subjects;
1006 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1007 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/subject[") << fmt::decimal(i) << "]");
1008 if (!item)
1009 break;
1010 lString32 subject = item->getText().trim();
1011 if (subjects_set) {
1012 subjects << "\n" << subject;
1013 }
1014 else {
1015 subjects << subject;
1016 subjects_set = true;
1017 }
1018 }
1019 m_doc_props->setString(DOC_PROP_KEYWORDS, subjects);
1020
1021 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1022 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/identifier[") << fmt::decimal(i) << "]");
1023 if (!item)
1024 break;
1025 lString32 key = item->getText().trim();
1026 if (decryptor->setManglingKey(key)) {
1027 CRLog::debug("Using font mangling key %s", LCSTR(key));
1028 break;
1029 }
1030 }
1031
1032 #if BUILD_LITE!=1
1033 // If there is a cache file, it contains the fully built DOM document
1034 // made from the multiple html fragments in the epub, and also
1035 // m_doc_props which has been serialized.
1036 // No need to do all the below work, except if we are only
1037 // requesting metadata (parsing some bits from the EPUB is still
1038 // less expensive than loading the full cache file).
1039 // We had to wait till here to do that, to not miss font mangling
1040 // key if any.
1041 if (!metadataOnly) {
1042 CRLog::debug("Trying loading from cache");
1043 if ( m_doc->openFromCache(formatCallback, progressCallback) ) {
1044 CRLog::debug("Loaded from cache");
1045 if ( progressCallback ) {
1046 progressCallback->OnLoadFileEnd( );
1047 }
1048 delete doc;
1049 return true;
1050 }
1051 CRLog::debug("Not loaded from cache, parsing epub content");
1052 }
1053 #endif
1054
1055 CRLog::info("Authors: %s Title: %s", LCSTR(authors), LCSTR(title));
1056 bool hasSeriesMeta = false;
1057 bool hasSeriesIdMeta = false;
1058 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1059 // If we've already got all of 'em, we're done
1060 if (hasSeriesIdMeta && !coverId.empty()) {
1061 break;
1062 }
1063
1064 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
1065 if ( !item )
1066 break;
1067
1068 lString32 name = item->getAttributeValue("name");
1069 // Might come before or after the series stuff
1070 // (e.g., while you might think it'd come early, Calibre appends it during the Send To Device process).
1071 // Fun fact: this isn't part of *either* version of the ePub specs.
1072 // It's simply an agreed-upon convention, given how utterly terrible the actual specs are.
1073 if (coverId.empty() && name == "cover") {
1074 lString32 content = item->getAttributeValue("content");
1075 coverId = content;
1076 continue;
1077 }
1078 // Has to come before calibre:series_index
1079 if (!hasSeriesMeta && name == "calibre:series") {
1080 lString32 content = item->getAttributeValue("content");
1081 PreProcessXmlString(content, 0);
1082 m_doc_props->setString(DOC_PROP_SERIES_NAME, content);
1083 hasSeriesMeta = true;
1084 continue;
1085 }
1086 // Has to come after calibre:series
1087 if (hasSeriesMeta && name == "calibre:series_index") {
1088 lString32 content = item->getAttributeValue("content");
1089 PreProcessXmlString(content, 0);
1090 m_doc_props->setString(DOC_PROP_SERIES_NUMBER, content);
1091 hasSeriesIdMeta = true;
1092 continue;
1093 }
1094 }
1095
1096 // Fallback to the ePub 3 spec for cover-image, c.f. https://www.w3.org/publishing/epub3/epub-packages.html#sec-cover-image
1097 if (isEpub3 && coverId.empty()) {
1098 for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1099 ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
1100 if ( !item )
1101 break;
1102
1103 // NOTE: Yes, plural, not a typo... -_-"
1104 lString32 props = item->getAttributeValue("properties");
1105 if (!props.empty() && props == "cover-image") {
1106 lString32 id = item->getAttributeValue("id");
1107 coverId = id;
1108 // Can only be one (or none), we're done!
1109 break;
1110 }
1111 }
1112 }
1113
1114 // Fallback to ePub 3 series metadata, c.f., https://www.w3.org/publishing/epub3/epub-packages.html#sec-belongs-to-collection
1115 // Because, yes, they're less standard than Calibre's ;D. Gotta love the ePub specs...
1116 // NOTE: This doesn't include the shittier variant where apparently a collection-type refines a dc:title's id,
1117 // or something? Not in the specs, so, don't care.
1118 // c.f., the first branch in https://github.com/koreader/crengine/issues/267#issuecomment-557507150
1119 // The only similar thing buried deep in the original 3.0 specs is incredibly convoluted:
1120 // http://idpf.org/epub/30/spec/epub30-publications.html#sec-opf-dctitle
1121 // That thankfully seems to have been relegated to the past, despite title-type still supporting a collection type:
1122 // https://www.w3.org/publishing/epub32/epub-packages.html#sec-title-type
1123 if (isEpub3 && !hasSeriesMeta) {
1124 lString32 seriesId;
1125 for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1126 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
1127 if ( !item )
1128 break;
1129
1130 lString32 property = item->getAttributeValue("property");
1131
1132 // If we don't have a collection yet, try to find one
1133 // NOTE: The specs say that collections *MAY* be nested (i.e., a belongs-to-collection node may refine another one).
1134 // For simplicity's sake, we only honor the first belongs-to-collection node here.
1135 // If I had actual test data, I could have instead opted to specifically match on the "parent" collection,
1136 // or the most deeply nested one, depending on what made the most sense, but I don't, so, KISS ;).
1137 if (!hasSeriesMeta) {
1138 if (property == "belongs-to-collection") {
1139 lString32 content = item->getText().trim();
1140 PreProcessXmlString(content, 0);
1141 m_doc_props->setString(DOC_PROP_SERIES_NAME, content);
1142 hasSeriesMeta = true;
1143 seriesId = item->getAttributeValue("id");
1144 // Next!
1145 continue;
1146 }
1147 }
1148
1149 // If we've got a collection, check if other properties refine it...
1150 if (hasSeriesMeta) {
1151 // NOTE: We don't really handle series any differently than set, so we don't really care about this...
1152 /*
1153 if (property == "collection-type") {
1154 // Only support valid types (series or set)
1155 lString32 content = item->getText().trim();
1156 if (content == "series" || content == "set") {
1157 lString32 id = item->getAttributeValue("refines");
1158 // Strip the anchor to match against seriesId
1159 if (id.startsWith("#")) {
1160 id = id.substr(1, id.length() - 1);
1161 }
1162 if (id == seriesId) {
1163 // Next!
1164 continue;
1165 }
1166 }
1167 }
1168 */
1169 if (property == "group-position") {
1170 lString32 id = item->getAttributeValue("refines");
1171 // Strip the anchor to match against seriesId
1172 if (id.startsWith("#")) {
1173 id = id.substr(1, id.length() - 1);
1174 }
1175 // If we've got a match, that's our position in the series!
1176 if (id == seriesId) {
1177 lString32 content = item->getText().trim();
1178 PreProcessXmlString(content, 0);
1179 // NOTE: May contain decimal values (much like calibre:series_index).
1180 // c.f., https://github.com/koreader/crengine/pull/346#discussion_r436190907
1181 m_doc_props->setString(DOC_PROP_SERIES_NUMBER, content);
1182 // And we're done :)
1183 break;
1184 }
1185 }
1186 }
1187 }
1188 }
1189
1190 if (metadataOnly && coverId.empty()) {
1191 // no cover to look for, no need for more work
1192 delete doc;
1193 return true;
1194 }
1195
1196 if ( progressCallback )
1197 progressCallback->OnLoadFileProgress(2);
1198
1199 // items
1200 CRLog::debug("opf: reading items");
1201 for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1202 ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
1203 if ( !item )
1204 break;
1205 lString32 href = item->getAttributeValue("href");
1206 lString32 mediaType = item->getAttributeValue("media-type");
1207 lString32 id = item->getAttributeValue("id");
1208 if ( !href.empty() && !id.empty() ) {
1209 href = DecodeHTMLUrlString(href);
1210 if ( id==coverId ) {
1211 // coverpage file
1212 lString32 coverFileName = LVCombinePaths(codeBase, href);
1213 CRLog::info("EPUB coverpage file: %s", LCSTR(coverFileName));
1214 LVStreamRef stream = m_arc->OpenStream(coverFileName.c_str(), LVOM_READ);
1215 if ( !stream.isNull() ) {
1216 LVImageSourceRef img = LVCreateStreamImageSource(stream);
1217 if ( !img.isNull() ) {
1218 CRLog::info("EPUB coverpage image is correct: %d x %d", img->GetWidth(), img->GetHeight() );
1219 m_doc_props->setString(DOC_PROP_COVER_FILE, coverFileName);
1220 }
1221 }
1222 if (metadataOnly) {
1223 // coverId found, no need for more work
1224 delete doc;
1225 return true;
1226 }
1227 }
1228 EpubItem * epubItem = new EpubItem;
1229 epubItem->href = href;
1230 epubItem->id = id;
1231 epubItem->mediaType = mediaType;
1232 epubItems.add( epubItem );
1233
1234 if ( isEpub3 && navHref.empty() ) {
1235 lString32 properties = item->getAttributeValue("properties");
1236 // We met properties="nav scripted"...
1237 if ( properties == U"nav" || properties.startsWith(U"nav ")
1238 || properties.endsWith(U" nav") || properties.pos(U" nav ") >= 0 ) {
1239 navHref = href;
1240 }
1241 }
1242
1243 // // register embedded document fonts
1244 // if (mediaType == U"application/vnd.ms-opentype"
1245 // || mediaType == U"application/x-font-otf"
1246 // || mediaType == U"application/x-font-ttf") { // TODO: more media types?
1247 // // TODO:
1248 // fontList.add(codeBase + href);
1249 // }
1250 }
1251 if (mediaType == "text/css") {
1252 lString32 name = LVCombinePaths(codeBase, href);
1253 LVStreamRef cssStream = m_arc->OpenStream(name.c_str(), LVOM_READ);
1254 if (!cssStream.isNull()) {
1255 lString8 cssFile = UnicodeToUtf8(LVReadTextFile(cssStream));
1256 lString32 base = name;
1257 LVExtractLastPathElement(base);
1258 //CRLog::trace("style: %s", cssFile.c_str());
1259 styleParser.parse(base, cssFile);
1260 }
1261 // Huge CSS files may take some time being parsed, so update progress
1262 // after each one to get a chance of it being displayed at this point.
1263 if ( progressCallback )
1264 progressCallback->OnLoadFileProgress(3);
1265 }
1266 }
1267 CRLog::debug("opf: reading items done.");
1268
1269 if ( progressCallback )
1270 progressCallback->OnLoadFileProgress(4);
1271
1272 // spine == itemrefs
1273 if ( epubItems.length()>0 ) {
1274 CRLog::debug("opf: reading spine");
1275 ldomNode * spine = doc->nodeFromXPath( cs32("package/spine") );
1276 if ( spine ) {
1277
1278 // <spine toc="ncx" page-map="page-map">
1279 EpubItem * ncx = epubItems.findById( spine->getAttributeValue("toc") ); //TODO
1280 if ( ncx!=NULL )
1281 ncxHref = LVCombinePaths(codeBase, ncx->href);
1282 EpubItem * page_map = epubItems.findById( spine->getAttributeValue("page-map") );
1283 if ( page_map!=NULL )
1284 pageMapHref = LVCombinePaths(codeBase, page_map->href);
1285
1286 for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1287 ldomNode * item = doc->nodeFromXPath(lString32("package/spine/itemref[") << fmt::decimal(i) << "]");
1288 if ( !item )
1289 break;
1290 EpubItem * epubItem = epubItems.findById( item->getAttributeValue("idref") );
1291 epubItem->nonlinear = lString32(item->getAttributeValue("linear")).lowercase() == U"no";
1292 if ( epubItem ) {
1293 // TODO: add to document
1294 spineItems.add( epubItem );
1295 }
1296 }
1297 }
1298 CRLog::debug("opf: reading spine done");
1299 }
1300 delete doc;
1301 CRLog::debug("opf: closed");
1302 }
1303
1304 if ( spineItems.length()==0 )
1305 return false;
1306
1307 if (metadataOnly)
1308 return true; // no need for more work
1309
1310 if ( progressCallback )
1311 progressCallback->OnLoadFileProgress(5);
1312
1313 lUInt32 saveFlags = m_doc->getDocFlags();
1314 m_doc->setDocFlags( saveFlags );
1315 m_doc->setContainer( m_arc );
1316
1317 ldomDocumentWriter writer(m_doc);
1318 #if 0
1319 m_doc->setNodeTypes( fb2_elem_table );
1320 m_doc->setAttributeTypes( fb2_attr_table );
1321 m_doc->setNameSpaceTypes( fb2_ns_table );
1322 #endif
1323 //m_doc->setCodeBase( codeBase );
1324
1325 int fontList_nb_before_head_parsing = fontList.length();
1326 if (!fontList.empty()) {
1327 // set document font list, and register fonts
1328 m_doc->getEmbeddedFontList().set(fontList);
1329 m_doc->registerEmbeddedFonts();
1330 }
1331
1332 ldomDocumentFragmentWriter appender(&writer, cs32("body"), cs32("DocFragment"), lString32::empty_str );
1333 writer.OnStart(NULL);
1334 writer.OnTagOpenNoAttr(U"", U"body");
1335 int fragmentCount = 0;
1336 size_t spineItemsNb = spineItems.length();
1337 for ( size_t i=0; i<spineItemsNb; i++ ) {
1338 if (spineItems[i]->mediaType == "application/xhtml+xml") {
1339 lString32 name = LVCombinePaths(codeBase, spineItems[i]->href);
1340 lString32 subst = cs32("_doc_fragment_") + fmt::decimal(i);
1341 appender.addPathSubstitution( name, subst );
1342 //CRLog::trace("subst: %s => %s", LCSTR(name), LCSTR(subst));
1343 }
1344 }
1345 int lastProgressPercent = 5;
1346 for ( size_t i=0; i<spineItemsNb; i++ ) {
1347 if ( progressCallback ) {
1348 int percent = 5 + 95 * i / spineItemsNb;
1349 if ( percent > lastProgressPercent ) {
1350 progressCallback->OnLoadFileProgress(percent);
1351 lastProgressPercent = percent;
1352 }
1353 }
1354 if (spineItems[i]->mediaType == "application/xhtml+xml") {
1355 lString32 name = LVCombinePaths(codeBase, spineItems[i]->href);
1356 {
1357 CRLog::debug("Checking fragment: %s", LCSTR(name));
1358 LVStreamRef stream = m_arc->OpenStream(name.c_str(), LVOM_READ);
1359 if ( !stream.isNull() ) {
1360 appender.setCodeBase( name );
1361 lString32 base = name;
1362 LVExtractLastPathElement(base);
1363 //CRLog::trace("base: %s", LCSTR(base));
1364 //LVXMLParser
1365 LVHTMLParser parser(stream, &appender);
1366 appender.setNonLinearFlag(spineItems[i]->nonlinear);
1367 if ( parser.CheckFormat() && parser.Parse() ) {
1368 // valid
1369 fragmentCount++;
1370 lString8 headCss = appender.getHeadStyleText();
1371 //CRLog::trace("style: %s", headCss.c_str());
1372 styleParser.parse(base, headCss);
1373 } else {
1374 CRLog::error("Document type is not XML/XHTML for fragment %s", LCSTR(name));
1375 }
1376 }
1377 }
1378 }
1379 }
1380
1381 // Clear any toc items possibly added while parsing the HTML
1382 m_doc->getToc()->clear();
1383 bool has_toc = false;
1384 bool has_pagemap = false;
1385
1386 // EPUB3 documents may contain both a toc.ncx and a nav xhtml toc.
1387 // We would have preferred to read first a toc.ncx if present, as it
1388 // is more structured than nav toc (all items have a href), but it
1389 // seems Sigil includes a toc.ncx for EPUB3, but does not keep it
1390 // up-to-date, while it does for the nav toc.
1391 if ( isEpub3 && !navHref.empty() ) {
1392 // Parse toc nav if epub3
1393 // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
1394 navHref = LVCombinePaths(codeBase, navHref);
1395 LVStreamRef stream = m_arc->OpenStream(navHref.c_str(), LVOM_READ);
1396 lString32 codeBase = LVExtractPath( navHref );
1397 if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1398 codeBase.append(1, U'/');
1399 appender.setCodeBase(codeBase);
1400 if ( !stream.isNull() ) {
1401 ldomDocument * navDoc = LVParseXMLStream( stream );
1402 if ( navDoc!=NULL ) {
1403 // Find <nav epub:type="toc">
1404 lUInt16 nav_id = navDoc->getElementNameIndex(U"nav");
1405 ldomNode * navDocRoot = navDoc->getRootNode();
1406 ldomNode * n = navDocRoot;
1407 // Kobo falls back to other <nav type=> when no <nav type=toc> is found,
1408 // let's do the same.
1409 ldomNode * n_toc = NULL;
1410 ldomNode * n_landmarks = NULL;
1411 ldomNode * n_page_list = NULL;
1412 if (n->isElement() && n->getChildCount() > 0) {
1413 int nextChildIndex = 0;
1414 n = n->getChildNode(nextChildIndex);
1415 while (true) {
1416 // Check only the first time we met a node (nextChildIndex == 0)
1417 // and not when we get back to it from a child to process next sibling
1418 if (nextChildIndex == 0) {
1419 if ( n->isElement() && n->getNodeId() == nav_id ) {
1420 lString32 type = n->getAttributeValue("type");
1421 if ( type == U"toc") {
1422 n_toc = n;
1423 }
1424 else if ( type == U"landmarks") {
1425 n_landmarks = n;
1426 }
1427 else if ( type == U"page-list") {
1428 n_page_list = n;
1429 }
1430 }
1431 }
1432 // Process next child
1433 if (n->isElement() && nextChildIndex < n->getChildCount()) {
1434 n = n->getChildNode(nextChildIndex);
1435 nextChildIndex = 0;
1436 continue;
1437 }
1438 // No more child, get back to parent and have it process our sibling
1439 nextChildIndex = n->getNodeIndex() + 1;
1440 n = n->getParentNode();
1441 if (!n) // back to root node
1442 break;
1443 if (n == navDocRoot && nextChildIndex >= n->getChildCount())
1444 // back to this node, and done with its children
1445 break;
1446 }
1447 }
1448 if ( !n_toc ) {
1449 if ( n_landmarks ) {
1450 n_toc = n_landmarks;
1451 }
1452 else if ( n_page_list ) {
1453 n_toc = n_page_list;
1454 }
1455 }
1456 if ( n_toc ) {
1457 // "Each nav element may contain an optional heading indicating the title
1458 // of the navigation list. The heading must be one of H1...H6."
1459 // We can't do much with this heading (that would not resolve to anything),
1460 // we could just add it as a top container item for the others, which will
1461 // be useless (and bothering), so let's just ignore it.
1462 // Get its first and single <OL> child
1463 ldomNode * ol_root = n_toc->findChildElement( LXML_NS_ANY, navDoc->getElementNameIndex(U"ol"), -1 );
1464 if ( ol_root )
1465 ReadEpubNavToc( m_doc, ol_root, m_doc->getToc(), appender );
1466 }
1467 if ( n_page_list ) {
1468 ldomNode * ol_root = n_page_list->findChildElement( LXML_NS_ANY, navDoc->getElementNameIndex(U"ol"), -1 );
1469 if ( ol_root )
1470 ReadEpubNavPageMap( m_doc, ol_root, m_doc->getPageMap(), appender );
1471 }
1472 delete navDoc;
1473 }
1474 }
1475 }
1476
1477 has_toc = m_doc->getToc()->getChildCount() > 0;
1478 has_pagemap = m_doc->getPageMap()->getChildCount() > 0;
1479
1480 // For EPUB2 (or EPUB3 where no nav toc was found): read ncx toc
1481 // We may also find in the ncx a <pageList> list
1482 if ( ( !has_toc || !has_pagemap ) && !ncxHref.empty() ) {
1483 LVStreamRef stream = m_arc->OpenStream(ncxHref.c_str(), LVOM_READ);
1484 lString32 codeBase = LVExtractPath( ncxHref );
1485 if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1486 codeBase.append(1, U'/');
1487 appender.setCodeBase(codeBase);
1488 if ( !stream.isNull() ) {
1489 ldomDocument * ncxdoc = LVParseXMLStream( stream );
1490 if ( ncxdoc!=NULL ) {
1491 if ( !has_toc ) {
1492 ldomNode * navMap = ncxdoc->nodeFromXPath( cs32("ncx/navMap"));
1493 if ( navMap!=NULL )
1494 ReadEpubNcxToc( m_doc, navMap, m_doc->getToc(), appender );
1495 }
1496 // http://blog.epubbooks.com/346/marking-up-page-numbers-in-the-epub-ncx/
1497 if ( !has_pagemap ) {
1498 ldomNode * pageList = ncxdoc->nodeFromXPath( cs32("ncx/pageList"));
1499 if ( pageList!=NULL )
1500 ReadEpubNcxPageList( m_doc, pageList, m_doc->getPageMap(), appender );
1501 }
1502 delete ncxdoc;
1503 }
1504 }
1505 }
1506
1507 has_toc = m_doc->getToc()->getChildCount() > 0;
1508 has_pagemap = m_doc->getPageMap()->getChildCount() > 0;
1509
1510 // If still no TOC, fallback to using the spine, as Kobo does.
1511 if ( !has_toc ) {
1512 LVTocItem * baseToc = m_doc->getToc();
1513 for ( size_t i=0; i<spineItemsNb; i++ ) {
1514 if (spineItems[i]->mediaType == "application/xhtml+xml") {
1515 lString32 title = spineItems[i]->id; // nothing much else to use
1516 lString32 href = appender.convertHref(spineItems[i]->id);
1517 if ( href.empty() || href[0]!='#' )
1518 continue;
1519 ldomNode * target = m_doc->getNodeById(m_doc->getAttrValueIndex(href.substr(1).c_str()));
1520 if ( !target )
1521 continue;
1522 ldomXPointer ptr(target, 0);
1523 baseToc->addChild(title, ptr, lString32::empty_str);
1524 }
1525 }
1526 }
1527
1528 // If no pagemap, parse Adobe page-map if there is one
1529 // https://wiki.mobileread.com/wiki/Adobe_Digital_Editions#Page-map
1530 if ( !has_pagemap && !pageMapHref.empty() ) {
1531 LVStreamRef stream = m_arc->OpenStream(pageMapHref.c_str(), LVOM_READ);
1532 lString32 codeBase = LVExtractPath( pageMapHref );
1533 if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1534 codeBase.append(1, U'/');
1535 appender.setCodeBase(codeBase);
1536 if ( !stream.isNull() ) {
1537 ldomDocument * pagemapdoc = LVParseXMLStream( stream );
1538 if ( pagemapdoc!=NULL ) {
1539 if ( !has_pagemap ) {
1540 ldomNode * pageMap = pagemapdoc->nodeFromXPath( cs32("page-map"));
1541 if ( pageMap!=NULL )
1542 ReadEpubAdobePageMap( m_doc, pageMap, m_doc->getPageMap(), appender );
1543 }
1544 delete pagemapdoc;
1545 }
1546 }
1547 }
1548
1549 if ( m_doc->getPageMap()->getChildCount() > 0 && !pageMapSource.empty() )
1550 m_doc->getPageMap()->setSource(pageMapSource);
1551
1552 writer.OnTagClose(U"", U"body");
1553 writer.OnStop();
1554 CRLog::debug("EPUB: %d documents merged", fragmentCount);
1555
1556 if ( fontList.length() != fontList_nb_before_head_parsing ) {
1557 // New fonts met when parsing <head><style> of some DocFragments
1558 m_doc->unregisterEmbeddedFonts();
1559 // set document font list, and register fonts
1560 m_doc->getEmbeddedFontList().set(fontList);
1561 m_doc->registerEmbeddedFonts();
1562 printf("CRE: document loaded, but styles re-init needed (cause: embedded fonts)\n");
1563 m_doc->forceReinitStyles();
1564 // todo: we could avoid forceReinitStyles() when embedded fonts are disabled
1565 // (but being here is quite rare - and having embedded font disabled even more)
1566 }
1567
1568 if ( fragmentCount==0 )
1569 return false;
1570
1571 #if 0
1572 // set stylesheet
1573 //m_doc->getStyleSheet()->clear();
1574 m_doc->setStyleSheet( NULL, true );
1575 //m_doc->getStyleSheet()->parse(m_stylesheet.c_str());
1576 if ( !css.empty() && m_doc->getDocFlag(DOC_FLAG_ENABLE_INTERNAL_STYLES) ) {
1577
1578 m_doc->setStyleSheet( "p.p { text-align: justify }\n"
1579 "svg { text-align: center }\n"
1580 "i { display: inline; font-style: italic }\n"
1581 "b { display: inline; font-weight: bold }\n"
1582 "abbr { display: inline }\n"
1583 "acronym { display: inline }\n"
1584 "address { display: inline }\n"
1585 "p.title-p { hyphenate: none }\n"
1586 //abbr, acronym, address, blockquote, br, cite, code, dfn, div, em, h1, h2, h3, h4, h5, h6, kbd, p, pre, q, samp, span, strong, var
1587 , false);
1588 m_doc->setStyleSheet( UnicodeToUtf8(css).c_str(), false );
1589 //m_doc->getStyleSheet()->parse(UnicodeToUtf8(css).c_str());
1590 } else {
1591 //m_doc->getStyleSheet()->parse(m_stylesheet.c_str());
1592 //m_doc->setStyleSheet( m_stylesheet.c_str(), false );
1593 }
1594 #endif
1595 #if 0
1596 LVStreamRef out = LVOpenFileStream( U"c:\\doc.xml" , LVOM_WRITE );
1597 if ( !out.isNull() )
1598 m_doc->saveToStream( out, "utf-8" );
1599 #endif
1600
1601 // DONE!
1602 if ( progressCallback ) {
1603 progressCallback->OnLoadFileEnd( );
1604 m_doc->compact();
1605 m_doc->dumpStatistics();
1606 }
1607
1608 // save compound XML document, for testing:
1609 //m_doc->saveToStream(LVOpenFileStream("/tmp/epub_dump.xml", LVOM_WRITE), NULL, true);
1610
1611 return true;
1612
1613 }
1614