1 #include "../include/epubfmt.h"
2 #include "../include/crlog.h"
3 
4 
5 class EpubItem {
6 public:
7     lString32 href;
8     lString32 mediaType;
9     lString32 id;
10     lString32 title;
11     bool nonlinear;
EpubItem()12     EpubItem()
13     { }
EpubItem(const EpubItem & v)14     EpubItem( const EpubItem & v )
15         : href(v.href), mediaType(v.mediaType), id(v.id)
16     { }
operator =(const EpubItem & v)17     EpubItem & operator = ( const EpubItem & v )
18     {
19         href = v.href;
20         mediaType = v.mediaType;
21         id = v.id;
22         return *this;
23     }
24 };
25 
26 class EpubItems : public LVPtrVector<EpubItem> {
27 public:
findById(const lString32 & id)28     EpubItem * findById( const lString32 & id )
29     {
30         if ( id.empty() )
31             return NULL;
32         for ( int i=0; i<length(); i++ )
33             if ( get(i)->id == id )
34                 return get(i);
35         return NULL;
36     }
37 };
38 
39 //static void dumpZip( LVContainerRef arc ) {
40 //    lString32 arcName = LVExtractFilenameWithoutExtension( arc->GetName() );
41 //    if ( arcName.empty() )
42 //        arcName = "unziparc";
43 //    lString32 outDir = cs32("/tmp/") + arcName;
44 //    LVCreateDirectory(outDir);
45 //    for ( int i=0; i<arc->GetObjectCount(); i++ ) {
46 //        const LVContainerItemInfo * info = arc->GetObjectInfo(i);
47 //        if ( !info->IsContainer() ) {
48 //            lString32 outFileName = outDir + "/" + info->GetName();
49 //            LVCreateDirectory(LVExtractPath(outFileName));
50 //            LVStreamRef in = arc->OpenStream(info->GetName(), LVOM_READ);
51 //            LVStreamRef out = LVOpenFileStream(outFileName.c_str(), LVOM_WRITE);
52 //            if ( !in.isNull() && !out.isNull() ) {
53 //                CRLog::trace("Writing %s", LCSTR(outFileName));
54 //                LVPumpStream(out.get(), in.get());
55 //            }
56 //        }
57 //    }
58 //}
59 
DetectEpubFormat(LVStreamRef stream)60 bool DetectEpubFormat( LVStreamRef stream )
61 {
62 
63 
64     LVContainerRef m_arc = LVOpenArchieve( stream );
65     if ( m_arc.isNull() )
66         return false; // not a ZIP archive
67 
68     //dumpZip( m_arc );
69 
70     // read "mimetype" file contents from root of archive
71     lString32 mimeType;
72     {
73         LVStreamRef mtStream = m_arc->OpenStream(U"mimetype", LVOM_READ );
74         if ( !mtStream.isNull() ) {
75             lvsize_t size = mtStream->GetSize();
76             if ( size>4 && size<100 ) {
77                 LVArray<char> buf( size+1, '\0' );
78                 if ( mtStream->Read( buf.get(), size, NULL )==LVERR_OK ) {
79                     for ( lvsize_t i=0; i<size; i++ )
80                         if ( buf[i]<32 || ((unsigned char)buf[i])>127 )
81                             buf[i] = 0;
82                     buf[size] = 0;
83                     if ( buf[0] )
84                         mimeType = Utf8ToUnicode( lString8( buf.get() ) );
85                 }
86             }
87         }
88     }
89 
90     if ( mimeType != U"application/epub+zip" )
91         return false;
92     return true;
93 }
94 
ReadEpubNcxToc(ldomDocument * doc,ldomNode * mapRoot,LVTocItem * baseToc,ldomDocumentFragmentWriter & appender)95 void ReadEpubNcxToc( ldomDocument * doc, ldomNode * mapRoot, LVTocItem * baseToc, ldomDocumentFragmentWriter & appender ) {
96     if ( !mapRoot || !baseToc)
97         return;
98     lUInt16 navPoint_id = mapRoot->getDocument()->getElementNameIndex(U"navPoint");
99     lUInt16 navLabel_id = mapRoot->getDocument()->getElementNameIndex(U"navLabel");
100     lUInt16 content_id = mapRoot->getDocument()->getElementNameIndex(U"content");
101     lUInt16 text_id = mapRoot->getDocument()->getElementNameIndex(U"text");
102     for ( int i=0; i<EPUB_TOC_MAX_ITER; i++ ) {
103         ldomNode * navPoint = mapRoot->findChildElement(LXML_NS_ANY, navPoint_id, i);
104         if ( !navPoint )
105             break;
106         ldomNode * navLabel = navPoint->findChildElement(LXML_NS_ANY, navLabel_id, -1);
107         if ( !navLabel )
108             continue;
109         ldomNode * text = navLabel->findChildElement(LXML_NS_ANY, text_id, -1);
110         if ( !text )
111             continue;
112         ldomNode * content = navPoint->findChildElement(LXML_NS_ANY, content_id, -1);
113         if ( !content )
114             continue;
115         lString32 href = content->getAttributeValue("src");
116         lString32 title = text->getText(' ');
117         title.trimDoubleSpaces(false, false, false);
118         if ( href.empty() || title.empty() )
119             continue;
120         //CRLog::trace("TOC href before convert: %s", LCSTR(href));
121         href = DecodeHTMLUrlString(href);
122         href = appender.convertHref(href);
123         //CRLog::trace("TOC href after convert: %s", LCSTR(href));
124         if ( href.empty() || href[0]!='#' )
125             continue;
126         ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
127         if ( !target )
128             continue;
129         ldomXPointer ptr(target, 0);
130         LVTocItem * tocItem = baseToc->addChild(title, ptr, lString32::empty_str);
131         ReadEpubNcxToc( doc, navPoint, tocItem, appender );
132     }
133 }
134 
ReadEpubNcxPageList(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)135 void ReadEpubNcxPageList( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
136     // http://idpf.org/epub/20/spec/OPF_2.0.1_draft.htm#Section2.4.1.2
137     // http://idpf.org/epub/a11y/techniques/techniques-20160711.html#refPackagesLatest
138     //    <pageTarget id="p4" playOrder="6" type="normal" value="2">
139     //      <navLabel><text>Page 8</text></navLabel>
140     //      <content src="OEBPS/PL12.xhtml#page_8"/>
141     //    </pageTarget>
142     // http://blog.epubbooks.com/346/marking-up-page-numbers-in-the-epub-ncx/
143     // type:value must be unique, and value can not be used as a short version of text...
144     // Also see http://kb.daisy.org/publishing/docs/navigation/pagelist.html
145     if ( !mapRoot || !pageMap)
146         return;
147     lUInt16 pageTarget_id = mapRoot->getDocument()->getElementNameIndex(U"pageTarget");
148     lUInt16 navLabel_id = mapRoot->getDocument()->getElementNameIndex(U"navLabel");
149     lUInt16 content_id = mapRoot->getDocument()->getElementNameIndex(U"content");
150     lUInt16 text_id = mapRoot->getDocument()->getElementNameIndex(U"text");
151     for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
152         ldomNode * pageTarget = mapRoot->findChildElement(LXML_NS_ANY, pageTarget_id, i);
153         if ( !pageTarget )
154             break;
155         ldomNode * navLabel = pageTarget->findChildElement(LXML_NS_ANY, navLabel_id, -1);
156         if ( !navLabel )
157             continue;
158         ldomNode * text = navLabel->findChildElement(LXML_NS_ANY, text_id, -1);
159         if ( !text )
160             continue;
161         ldomNode * content = pageTarget->findChildElement(LXML_NS_ANY, content_id, -1);
162         if ( !content )
163             continue;
164         lString32 href = content->getAttributeValue("src");
165         lString32 title = text->getText(' ');
166         title.trimDoubleSpaces(false, false, false);
167         if ( href.empty() || title.empty() )
168             continue;
169         href = DecodeHTMLUrlString(href);
170         href = appender.convertHref(href);
171         if ( href.empty() || href[0]!='#' )
172             continue;
173         ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
174         if ( !target )
175             continue;
176         ldomXPointer ptr(target, 0);
177         pageMap->addPage(title, ptr, lString32::empty_str);
178     }
179 }
180 
ReadEpubNavToc(ldomDocument * doc,ldomNode * mapRoot,LVTocItem * baseToc,ldomDocumentFragmentWriter & appender)181 void ReadEpubNavToc( ldomDocument * doc, ldomNode * mapRoot, LVTocItem * baseToc, ldomDocumentFragmentWriter & appender ) {
182     // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
183     if ( !mapRoot || !baseToc)
184         return;
185     lUInt16 ol_id = mapRoot->getDocument()->getElementNameIndex(U"ol");
186     lUInt16 li_id = mapRoot->getDocument()->getElementNameIndex(U"li");
187     lUInt16 a_id = mapRoot->getDocument()->getElementNameIndex(U"a");
188     lUInt16 span_id = mapRoot->getDocument()->getElementNameIndex(U"span");
189     for ( int i=0; i<EPUB_TOC_MAX_ITER; i++ ) {
190         ldomNode * li = mapRoot->findChildElement(LXML_NS_ANY, li_id, i);
191         if ( !li )
192             break;
193         LVTocItem * tocItem = NULL;
194         ldomNode * a = li->findChildElement(LXML_NS_ANY, a_id, -1);
195         if ( a ) {
196             lString32 href = a->getAttributeValue("href");
197             lString32 title = a->getText(' ');
198             if ( title.empty() ) {
199                 // "If the a element contains [...] that do not provide intrinsic text alternatives,
200                 // it must also include a title attribute with an alternate text rendition of the
201                 // link label."
202                 title = a->getAttributeValue("title");
203             }
204             title.trimDoubleSpaces(false, false, false);
205             if ( !href.empty() ) {
206                 href = DecodeHTMLUrlString(href);
207                 href = appender.convertHref(href);
208                 if ( !href.empty() && href[0]=='#' ) {
209                     ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
210                     if ( target ) {
211                         ldomXPointer ptr(target, 0);
212                         tocItem = baseToc->addChild(title, ptr, lString32::empty_str);
213                         // Report xpointer to upper parent(s) that didn't have
214                         // one (no <a>) - but stop before the root node
215                         LVTocItem * tmp = baseToc;
216                         while ( tmp && tmp->getLevel() > 0 && tmp->getXPointer().isNull() ) {
217                             tmp->setXPointer(ptr);
218                             tmp = tmp->getParent();
219                         }
220                     }
221                 }
222             }
223         }
224         // "The a element may optionally be followed by an ol ordered list representing
225         // a subsidiary content level below that heading (e.g., all the subsection
226         // headings of a section). The span element must be followed by an ol ordered
227         // list: it cannot be used in "leaf" li elements."
228         ldomNode * ol = li->findChildElement( LXML_NS_ANY, ol_id, -1 );
229         if ( ol ) { // there are sub items
230             if ( !tocItem ) {
231                 // Make a LVTocItem to contain sub items
232                 // There can be a <span>, with no href: children will set it to its own xpointer
233                 lString32 title;
234                 ldomNode * span = li->findChildElement(LXML_NS_ANY, span_id, -1);
235                 if ( span ) {
236                     title = span->getText(' ');
237                     title.trimDoubleSpaces(false, false, false);
238                 }
239                 // If none, let title empty
240                 tocItem = baseToc->addChild(title, ldomXPointer(), lString32::empty_str);
241             }
242             ReadEpubNavToc( doc, ol, tocItem, appender );
243         }
244     }
245 }
246 
ReadEpubNavPageMap(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)247 void ReadEpubNavPageMap( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
248     // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
249     if ( !mapRoot || !pageMap)
250         return;
251     lUInt16 li_id = mapRoot->getDocument()->getElementNameIndex(U"li");
252     lUInt16 a_id = mapRoot->getDocument()->getElementNameIndex(U"a");
253     for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
254         ldomNode * li = mapRoot->findChildElement(LXML_NS_ANY, li_id, i);
255         if ( !li )
256             break;
257         ldomNode * a = li->findChildElement(LXML_NS_ANY, a_id, -1);
258         if ( a ) {
259             lString32 href = a->getAttributeValue("href");
260             lString32 title = a->getText(' ');
261             if ( title.empty() ) {
262                 title = a->getAttributeValue("title");
263             }
264             title.trimDoubleSpaces(false, false, false);
265             if ( !href.empty() ) {
266                 href = DecodeHTMLUrlString(href);
267                 href = appender.convertHref(href);
268                 if ( !href.empty() && href[0]=='#' ) {
269                     ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
270                     if ( target ) {
271                         ldomXPointer ptr(target, 0);
272                         pageMap->addPage(title, ptr, lString32::empty_str);
273                     }
274                 }
275             }
276         }
277     }
278 }
279 
ReadEpubAdobePageMap(ldomDocument * doc,ldomNode * mapRoot,LVPageMap * pageMap,ldomDocumentFragmentWriter & appender)280 void ReadEpubAdobePageMap( ldomDocument * doc, ldomNode * mapRoot, LVPageMap * pageMap, ldomDocumentFragmentWriter & appender ) {
281     // https://wiki.mobileread.com/wiki/Adobe_Digital_Editions#Page-map
282     if ( !mapRoot || !pageMap)
283         return;
284     lUInt16 page_id = mapRoot->getDocument()->getElementNameIndex(U"page");
285     for ( int i=0; i<EPUB_ITEM_MAX_ITER; i++ ) {
286         ldomNode * page = mapRoot->findChildElement(LXML_NS_ANY, page_id, i);
287         if ( !page )
288             break;
289         lString32 href = page->getAttributeValue("href");
290         lString32 title = page->getAttributeValue("name");
291         title.trimDoubleSpaces(false, false, false);
292         if ( href.empty() || title.empty() )
293             continue;
294         href = DecodeHTMLUrlString(href);
295         href = appender.convertHref(href);
296         if ( href.empty() || href[0]!='#' )
297             continue;
298         ldomNode * target = doc->getNodeById(doc->getAttrValueIndex(href.substr(1).c_str()));
299         if ( !target )
300             continue;
301         ldomXPointer ptr(target, 0);
302         pageMap->addPage(title, ptr, lString32::empty_str);
303     }
304 }
305 
EpubGetRootFilePath(LVContainerRef m_arc)306 lString32 EpubGetRootFilePath(LVContainerRef m_arc)
307 {
308     // check root media type
309     lString32 rootfilePath;
310     lString32 rootfileMediaType;
311     // read container.xml
312     {
313         LVStreamRef container_stream = m_arc->OpenStream(U"META-INF/container.xml", LVOM_READ);
314         if ( !container_stream.isNull() ) {
315             ldomDocument * doc = LVParseXMLStream( container_stream );
316             if ( doc ) {
317                 ldomNode * rootfile = doc->nodeFromXPath( cs32("container/rootfiles/rootfile") );
318                 if ( rootfile && rootfile->isElement() ) {
319                     rootfilePath = rootfile->getAttributeValue("full-path");
320                     rootfileMediaType = rootfile->getAttributeValue("media-type");
321                 }
322                 delete doc;
323             }
324         }
325     }
326 
327     if (rootfilePath.empty() || rootfileMediaType != "application/oebps-package+xml")
328         return lString32::empty_str;
329     return rootfilePath;
330 }
331 
332 /// encrypted font demangling proxy: XORs first 1024 bytes of source stream with key
333 class FontDemanglingStream : public StreamProxy {
334     LVArray<lUInt8> & _key;
335 public:
FontDemanglingStream(LVStreamRef baseStream,LVArray<lUInt8> & key)336     FontDemanglingStream(LVStreamRef baseStream, LVArray<lUInt8> & key) : StreamProxy(baseStream), _key(key) {
337     }
338 
Read(void * buf,lvsize_t count,lvsize_t * nBytesRead)339     virtual lverror_t Read( void * buf, lvsize_t count, lvsize_t * nBytesRead ) {
340         lvpos_t pos = _base->GetPos();
341         lverror_t res = _base->Read(buf, count, nBytesRead);
342         if (pos < 1024 && _key.length() == 16) {
343             for (int i=0; i + pos < 1024; i++) {
344                 int keyPos = (i + pos) & 15;
345                 ((lUInt8*)buf)[i] ^= _key[keyPos];
346             }
347         }
348         return res;
349     }
350 
351 };
352 
353 class EncryptedItem {
354 public:
355     lString32 _uri;
356     lString32 _method;
EncryptedItem(lString32 uri,lString32 method)357     EncryptedItem(lString32 uri, lString32 method) : _uri(uri), _method(method) {
358 
359     }
360 };
361 
362 class EncryptedItemCallback {
363 public:
364     virtual void addEncryptedItem(EncryptedItem * item) = 0;
~EncryptedItemCallback()365     virtual ~EncryptedItemCallback() {}
366 };
367 
368 
369 class EncCallback : public LVXMLParserCallback {
370     bool insideEncryption;
371     bool insideEncryptedData;
372     bool insideEncryptionMethod;
373     bool insideCipherData;
374     bool insideCipherReference;
375 public:
376     /// called on opening tag <
OnTagOpen(const lChar32 * nsname,const lChar32 * tagname)377     virtual ldomNode * OnTagOpen( const lChar32 * nsname, const lChar32 * tagname) {
378         CR_UNUSED(nsname);
379         if (!lStr_cmp(tagname, "encryption"))
380             insideEncryption = true;
381         else if (!lStr_cmp(tagname, "EncryptedData"))
382             insideEncryptedData = true;
383         else if (!lStr_cmp(tagname, "EncryptionMethod"))
384             insideEncryptionMethod = true;
385         else if (!lStr_cmp(tagname, "CipherData"))
386             insideCipherData = true;
387         else if (!lStr_cmp(tagname, "CipherReference"))
388             insideCipherReference = true;
389         return NULL;
390     }
391     /// called on tag close
OnTagClose(const lChar32 * nsname,const lChar32 * tagname,bool self_closing_tag=false)392     virtual void OnTagClose( const lChar32 * nsname, const lChar32 * tagname, bool self_closing_tag=false ) {
393         CR_UNUSED(nsname);
394         if (!lStr_cmp(tagname, "encryption"))
395             insideEncryption = false;
396         else if (!lStr_cmp(tagname, "EncryptedData") && insideEncryptedData) {
397             if (!algorithm.empty() && !uri.empty()) {
398                 _container->addEncryptedItem(new EncryptedItem(uri, algorithm));
399             }
400             insideEncryptedData = false;
401         } else if (!lStr_cmp(tagname, "EncryptionMethod"))
402             insideEncryptionMethod = false;
403         else if (!lStr_cmp(tagname, "CipherData"))
404             insideCipherData = false;
405         else if (!lStr_cmp(tagname, "CipherReference"))
406             insideCipherReference = false;
407     }
408     /// called on element attribute
OnAttribute(const lChar32 * nsname,const lChar32 * attrname,const lChar32 * attrvalue)409     virtual void OnAttribute( const lChar32 * nsname, const lChar32 * attrname, const lChar32 * attrvalue ) {
410         CR_UNUSED2(nsname, attrvalue);
411         if (!lStr_cmp(attrname, "URI") && insideCipherReference)
412             insideEncryption = false;
413         else if (!lStr_cmp(attrname, "Algorithm") && insideEncryptionMethod)
414             insideEncryptedData = false;
415     }
416     /// called on text
OnText(const lChar32 * text,int len,lUInt32 flags)417     virtual void OnText( const lChar32 * text, int len, lUInt32 flags ) {
418         CR_UNUSED3(text,len,flags);
419     }
420     /// add named BLOB data to document
OnBlob(lString32 name,const lUInt8 * data,int size)421     virtual bool OnBlob(lString32 name, const lUInt8 * data, int size) {
422         CR_UNUSED3(name,data,size);
423         return false;
424     }
425 
OnStop()426     virtual void OnStop() { }
427     /// called after > of opening tag (when entering tag body)
OnTagBody()428     virtual void OnTagBody() { }
429 
430     EncryptedItemCallback * _container;
431     lString32 algorithm;
432     lString32 uri;
433     /// destructor
EncCallback(EncryptedItemCallback * container)434     EncCallback(EncryptedItemCallback * container) : _container(container) {
435         insideEncryption = false;
436         insideEncryptedData = false;
437         insideEncryptionMethod = false;
438         insideCipherData = false;
439         insideCipherReference = false;
440     }
~EncCallback()441     virtual ~EncCallback() {}
442 };
443 
444 class EncryptedDataContainer : public LVContainer, public EncryptedItemCallback {
445     LVContainerRef _container;
446     LVPtrVector<EncryptedItem> _list;
447 public:
EncryptedDataContainer(LVContainerRef baseContainer)448     EncryptedDataContainer(LVContainerRef baseContainer) : _container(baseContainer) {
449 
450     }
451 
GetParentContainer()452     virtual LVContainer * GetParentContainer() { return _container->GetParentContainer(); }
453     //virtual const LVContainerItemInfo * GetObjectInfo(const lChar32 * pname);
GetObjectInfo(int index)454     virtual const LVContainerItemInfo * GetObjectInfo(int index) { return _container->GetObjectInfo(index); }
GetObjectCount() const455     virtual int GetObjectCount() const { return _container->GetObjectCount(); }
456     /// returns object size (file size or directory entry count)
GetSize(lvsize_t * pSize)457     virtual lverror_t GetSize( lvsize_t * pSize ) { return _container->GetSize(pSize); }
458 
459 
OpenStream(const lChar32 * fname,lvopen_mode_t mode)460     virtual LVStreamRef OpenStream( const lChar32 * fname, lvopen_mode_t mode ) {
461 
462         LVStreamRef res = _container->OpenStream(fname, mode);
463         if (res.isNull())
464             return res;
465         if (isEncryptedItem(fname))
466             return LVStreamRef(new FontDemanglingStream(res, _fontManglingKey));
467         return res;
468     }
469 
470     /// returns stream/container name, may be NULL if unknown
GetName()471     virtual const lChar32 * GetName()
472     {
473         return _container->GetName();
474     }
475     /// sets stream/container name, may be not implemented for some objects
SetName(const lChar32 * name)476     virtual void SetName(const lChar32 * name)
477     {
478         _container->SetName(name);
479     }
480 
481 
addEncryptedItem(EncryptedItem * item)482     virtual void addEncryptedItem(EncryptedItem * item) {
483         _list.add(item);
484     }
485 
findEncryptedItem(const lChar32 * name)486     EncryptedItem * findEncryptedItem(const lChar32 * name) {
487         lString32 n;
488         if (name[0] != '/' && name[0] != '\\')
489             n << "/";
490         n << name;
491         for (int i=0; i<_list.length(); i++) {
492             lString32 s = _list[i]->_uri;
493             if (s[0]!='/' && s[i]!='\\')
494                 s = "/" + s;
495             if (_list[i]->_uri == s)
496                 return _list[i];
497         }
498         return NULL;
499     }
500 
isEncryptedItem(const lChar32 * name)501     bool isEncryptedItem(const lChar32 * name) {
502         return findEncryptedItem(name) != NULL;
503     }
504 
505     LVArray<lUInt8> _fontManglingKey;
506 
setManglingKey(lString32 key)507     bool setManglingKey(lString32 key) {
508         if (key.startsWith("urn:uuid:"))
509             key = key.substr(9);
510         _fontManglingKey.clear();
511         _fontManglingKey.reserve(16);
512         lUInt8 b = 0;
513         int n = 0;
514         for (int i=0; i<key.length(); i++) {
515             int d = hexDigit(key[i]);
516             if (d>=0) {
517                 b = (b << 4) | d;
518                 if (++n > 1) {
519                     _fontManglingKey.add(b);
520                     n = 0;
521                     b = 0;
522                 }
523             }
524         }
525         return _fontManglingKey.length() == 16;
526     }
527 
hasUnsupportedEncryption()528     bool hasUnsupportedEncryption() {
529         for (int i=0; i<_list.length(); i++) {
530             lString32 method = _list[i]->_method;
531             if (method != "http://ns.adobe.com/pdf/enc#RC") {
532                 CRLog::debug("unsupported encryption method: %s", LCSTR(method));
533                 return true;
534             }
535         }
536         return false;
537     }
538 
open()539     bool open() {
540         LVStreamRef stream = _container->OpenStream(U"META-INF/encryption.xml", LVOM_READ);
541         if (stream.isNull())
542             return false;
543         EncCallback enccallback(this);
544         LVXMLParser parser(stream, &enccallback, false, false);
545         if (!parser.Parse())
546             return false;
547         if (_list.length())
548             return true;
549         return false;
550     }
551 };
552 
createEncryptedEpubWarningDocument(ldomDocument * m_doc)553 void createEncryptedEpubWarningDocument(ldomDocument * m_doc) {
554     CRLog::error("EPUB document contains encrypted items");
555     ldomDocumentWriter writer(m_doc);
556     writer.OnTagOpenNoAttr(NULL, U"body");
557     writer.OnTagOpenNoAttr(NULL, U"h3");
558     lString32 hdr("Encrypted content");
559     writer.OnText(hdr.c_str(), hdr.length(), 0);
560     writer.OnTagClose(NULL, U"h3");
561 
562     writer.OnTagOpenAndClose(NULL, U"hr");
563 
564     writer.OnTagOpenNoAttr(NULL, U"p");
565     lString32 txt("This document is encrypted (has DRM protection).");
566     writer.OnText(txt.c_str(), txt.length(), 0);
567     writer.OnTagClose(NULL, U"p");
568 
569     writer.OnTagOpenNoAttr(NULL, U"p");
570     lString32 txt2("Cool Reader doesn't support reading of DRM protected books.");
571     writer.OnText(txt2.c_str(), txt2.length(), 0);
572     writer.OnTagClose(NULL, U"p");
573 
574     writer.OnTagOpenNoAttr(NULL, U"p");
575     lString32 txt3("To read this book, please use software recommended by book seller.");
576     writer.OnText(txt3.c_str(), txt3.length(), 0);
577     writer.OnTagClose(NULL, U"p");
578 
579     writer.OnTagOpenAndClose(NULL, U"hr");
580 
581     writer.OnTagOpenNoAttr(NULL, U"p");
582     lString32 txt4("");
583     writer.OnText(txt4.c_str(), txt4.length(), 0);
584     writer.OnTagClose(NULL, U"p");
585 
586     writer.OnTagClose(NULL, U"body");
587 }
588 
GetEpubCoverpage(LVContainerRef arc)589 LVStreamRef GetEpubCoverpage(LVContainerRef arc)
590 {
591     // check root media type
592     lString32 rootfilePath = EpubGetRootFilePath(arc);
593     if ( rootfilePath.empty() )
594         return LVStreamRef();
595 
596     EncryptedDataContainer * decryptor = new EncryptedDataContainer(arc);
597     if (decryptor->open()) {
598         CRLog::debug("EPUB: encrypted items detected");
599     }
600 
601     LVContainerRef m_arc = LVContainerRef(decryptor);
602 
603     lString32 codeBase = LVExtractPath(rootfilePath, false);
604     CRLog::trace("codeBase=%s", LCSTR(codeBase));
605 
606     LVStreamRef content_stream = m_arc->OpenStream(rootfilePath.c_str(), LVOM_READ);
607     if ( content_stream.isNull() )
608         return LVStreamRef();
609 
610 
611     LVStreamRef coverPageImageStream;
612     // reading content stream
613     {
614         lString32 coverId;
615         ldomDocument * doc = LVParseXMLStream( content_stream );
616         if ( !doc )
617             return LVStreamRef();
618 
619         for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
620             ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
621             if ( !item )
622                 break;
623             lString32 name = item->getAttributeValue("name");
624             if (name == "cover") {
625                 lString32 content = item->getAttributeValue("content");
626                 coverId = content;
627                 // We're done
628                 break;
629             }
630         }
631 
632         // items
633         for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
634             ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
635             if ( !item )
636                 break;
637             lString32 href = item->getAttributeValue("href");
638             lString32 id = item->getAttributeValue("id");
639             if ( !href.empty() && !id.empty() ) {
640                 if (id == coverId) {
641                     // coverpage file
642                     href = DecodeHTMLUrlString(href);
643                     lString32 coverFileName = LVCombinePaths(codeBase, href);
644                     CRLog::info("EPUB coverpage file: %s", LCSTR(coverFileName));
645                     coverPageImageStream = m_arc->OpenStream(coverFileName.c_str(), LVOM_READ);
646                     // We're done
647                     break;
648                 }
649             }
650         }
651         delete doc;
652     }
653 
654     return coverPageImageStream;
655 }
656 
657 
658 class EmbeddedFontStyleParser {
659     LVEmbeddedFontList & _fontList;
660     lString32 _basePath;
661     int _state;
662     lString8 _face;
663     lString8 islocal;
664     bool _italic;
665     bool _bold;
666     lString32 _url;
667 public:
EmbeddedFontStyleParser(LVEmbeddedFontList & fontList)668     EmbeddedFontStyleParser(LVEmbeddedFontList & fontList) : _fontList(fontList) { }
onToken(char token)669     void onToken(char token) {
670         // 4,5:  font-family:
671         // 6,7:  font-weight:
672         // 8,9:  font-style:
673         //10,11: src:
674         //   10   11    12   13
675         //   src   :   url    (
676         //CRLog::trace("state==%d: %c ", _state, token);
677         switch (token) {
678         case ':':
679             if (_state < 2) {
680                 _state = 0;
681             } else if (_state == 4 || _state == 6 || _state == 8 || _state == 10) {
682                 _state++;
683             } else if (_state != 3) {
684                 _state = 2;
685             }
686             break;
687         case ';':
688             if (_state < 2) {
689                 _state = 0;
690             } else if (_state != 3) {
691                 _state = 2;
692             }
693             break;
694         case '{':
695             if (_state == 1) {
696                 _state = 2; // inside @font {
697                 _face.clear();
698                 _italic = false;
699                 _bold = false;
700                 _url.clear();
701             } else
702                 _state = 3; // inside other {
703             break;
704         case '}':
705             if (_state == 2) {
706                 if (!_url.empty()) {
707 //                    CRLog::trace("@font { face: %s; bold: %s; italic: %s; url: %s", _face.c_str(), _bold ? "yes" : "no",
708 //                                 _italic ? "yes" : "no", LCSTR(_url));
709                     if (islocal.length()==5 && _basePath.length()!=0)
710                         _url = _url.substr((_basePath.length()+1), (_url.length()-_basePath.length()));
711                     if (_fontList.findByUrl(_url))
712                         _url=_url.append(lString32(" ")); //avoid add() replaces existing local name
713                     _fontList.add(_url, _face, _bold, _italic);
714                 }
715             }
716             _state = 0;
717             break;
718         case ',':
719             if (_state == 2) {
720                 if (!_url.empty()) {
721                       if (islocal.length() == 5 && _basePath.length()!=0) _url=(_url.substr((_basePath.length()+1),(_url.length()-_basePath.length())));
722                         if (_fontList.findByUrl(_url)) _url=_url.append(lString32(" "));
723                     _fontList.add(_url, _face, _bold, _italic);
724                 }
725                 _state = 11;
726             }
727             break;
728         case '(':
729             if (_state == 12) {
730                 _state = 13;
731             } else {
732                 if (_state > 3)
733                     _state = 2;
734             }
735             break;
736         }
737     }
onToken(lString8 & token)738     void onToken(lString8 & token) {
739         if (token.empty())
740             return;
741         lString8 t = token;
742         token.clear();
743         //CRLog::trace("state==%d: %s", _state, t.c_str());
744         if (t == "@font-face") {
745             if (_state == 0)
746                 _state = 1; // right after @font
747             return;
748         }
749         if (_state == 1)
750             _state = 0;
751         if (_state == 2) {
752             if (t == "font-family")
753                 _state = 4;
754             else if (t == "font-weight")
755                 _state = 6;
756             else if (t == "font-style")
757                 _state = 8;
758             else if (t == "src")
759                 _state = 10;
760         } else if (_state == 5) {
761             _face = t;
762             _state = 2;
763         } else if (_state == 7) {
764             if (t == "bold")
765                 _bold = true;
766             _state = 2;
767         } else if (_state == 9) {
768             if (t == "italic")
769                 _italic = true;
770             _state = 2;
771         } else if (_state == 11) {
772             if (t == "url") {
773                 _state = 12;
774                 islocal=t;
775             }
776             else if (t=="local") {
777                 _state=12;
778                 islocal=t;
779             }
780             else
781                 _state = 2;
782         }
783     }
onQuotedText(lString8 & token)784     void onQuotedText(lString8 & token) {
785         //CRLog::trace("state==%d: \"%s\"", _state, token.c_str());
786         if (_state == 11 || _state == 13) {
787             if (!token.empty()) {
788                 lString32 ltoken = Utf8ToUnicode(token);
789                 if (ltoken.startsWithNoCase(lString32("res://")) || ltoken.startsWithNoCase(lString32("file://")) )
790                     _url = ltoken;
791                 else
792                     _url = LVCombinePaths(_basePath, ltoken);
793             }
794             _state = 2;
795         } else if (_state == 5) {
796             if (!token.empty()) {
797                 _face = token;
798             }
799             _state = 2;
800         }
801         token.clear();
802     }
deletecomment(lString8 css)803     lString8 deletecomment(lString8 css) {
804         int state;
805         lString8 tmp=lString8("");
806         tmp.reserve( css.length() );
807         char c;
808         state = 0;
809         for (int i=0;i<css.length();i++) {
810             c=css[i];
811             if (state == 0 ) {
812                 if (c == ('/'))           // ex. [/]
813                     state = 1;
814                 else if (c == ('\'') )    // ex. [']
815                     state = 5;
816                 else if (c == ('\"'))     // ex. ["]
817                     state = 7;
818             }
819             else if (state == 1 && c == ('*'))     // ex. [/*]
820                     state = 2;
821             else if (state == 1) {                // ex. [<secure/_stdio.h> or 5/3]
822                     tmp<<('/');
823                     state = 0;
824             }
825             else if (state == 2 && c == ('*'))    // ex. [/*he*]
826                     state = 3;
827             else if (state == 2)                // ex. [/*heh]
828                     state = 2;
829             else if (state == 3 && c == ('/'))    // ex. [/*heh*/]
830                     state = 0;
831             else if (state == 3)                // ex. [/*heh*e]
832                     state = 2;
833             /* Moved up for faster normal path:
834             else if (state == 0 && c == ('\'') )    // ex. [']
835                     state = 5;
836             */
837             else if (state == 5 && c == ('\\'))     // ex. ['\]
838                     state = 6;
839             else if (state == 6)                // ex. ['\n or '\' or '\t etc.]
840                     state = 5;
841             else if (state == 5 && c == ('\'') )   // ex. ['\n' or '\'' or '\t' ect.]
842                     state = 0;
843             /* Moved up for faster normal path:
844             else if (state == 0 && c == ('\"'))    // ex. ["]
845                     state = 7;
846             */
847             else if (state == 8)                // ex. ["\n or "\" or "\t ect.]
848                     state = 7;
849             else if (state == 7 && c == ('\"'))    // ex. ["\n" or "\"" or "\t" ect.]
850                     state = 0;
851             if ((state == 0 && c != ('/')) || state == 5 || state == 6 || state == 7 || state == 8)
852                     tmp<<c;
853         }
854         return tmp;
855     }
parse(lString32 basePath,const lString8 & css)856     void parse(lString32 basePath, const lString8 & css) {
857         _state = 0;
858         _basePath = basePath;
859         lString8 token;
860         char insideQuotes = 0;
861         lString8 css_ = deletecomment(css);
862         for (int i=0; i<css_.length(); i++) {
863             char ch = css_[i];
864             if (insideQuotes || _state == 13) {
865                 if (ch == insideQuotes || (_state == 13 && ch == ')')) {
866                     onQuotedText(token);
867                     insideQuotes =  0;
868                     if (_state == 13)
869                         onToken(ch);
870                 } else {
871                     if (_state == 13 && token.empty() && (ch == '\'' || ch=='\"')) {
872                         insideQuotes = ch;
873                     } else if (ch != ' ' || _state != 13)
874                         token << ch;
875                 }
876                 continue;
877             }
878             if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n') {
879                 onToken(token);
880             } else if (ch == '@' || ch=='-' || ch=='_' || ch=='.' || (ch>='a' && ch <='z') || (ch>='A' && ch <='Z') || (ch>='0' && ch <='9')) {
881                 token << ch;
882             } else if (ch == ':' || ch=='{' || ch == '}' || ch=='(' || ch == ')' || ch == ';' || ch == ',') {
883                 onToken(token);
884                 onToken(ch);
885             } else if (ch == '\'' || ch == '\"') {
886                 onToken(token);
887                 insideQuotes = ch;
888             }
889         }
890     }
891 };
892 
ImportEpubDocument(LVStreamRef stream,ldomDocument * m_doc,LVDocViewCallback * progressCallback,CacheLoadingCallback * formatCallback,bool metadataOnly)893 bool ImportEpubDocument( LVStreamRef stream, ldomDocument * m_doc, LVDocViewCallback * progressCallback, CacheLoadingCallback * formatCallback, bool metadataOnly )
894 {
895     LVContainerRef arc = LVOpenArchieve( stream );
896     if ( arc.isNull() )
897         return false; // not a ZIP archive
898 
899     // check root media type
900     lString32 rootfilePath = EpubGetRootFilePath(arc);
901     if ( rootfilePath.empty() )
902         return false;
903 
904     EncryptedDataContainer * decryptor = new EncryptedDataContainer(arc);
905     if (decryptor->open()) {
906         CRLog::debug("EPUB: encrypted items detected");
907     }
908 
909     LVContainerRef m_arc = LVContainerRef(decryptor);
910 
911     if (decryptor->hasUnsupportedEncryption()) {
912         // DRM!!!
913         createEncryptedEpubWarningDocument(m_doc);
914         return true;
915     }
916 
917     m_doc->setContainer(m_arc);
918 
919     if ( progressCallback )
920         progressCallback->OnLoadFileProgress(1);
921 
922     // read content.opf
923     EpubItems epubItems;
924     //EpubItem * epubToc = NULL; //TODO
925     LVArray<EpubItem*> spineItems;
926     lString32 codeBase;
927     //lString32 css;
928 
929     //
930     {
931         codeBase=LVExtractPath(rootfilePath, false);
932         CRLog::trace("codeBase=%s", LCSTR(codeBase));
933     }
934 
935     LVStreamRef content_stream = m_arc->OpenStream(rootfilePath.c_str(), LVOM_READ);
936     if ( content_stream.isNull() )
937         return false;
938 
939 
940     bool isEpub3 = false;
941     lString32 epubVersion;
942     lString32 navHref; // epub3 TOC
943     lString32 ncxHref; // epub2 TOC
944     lString32 pageMapHref; // epub2 Adobe page-map
945     lString32 pageMapSource;
946     lString32 coverId;
947 
948     LVEmbeddedFontList fontList;
949     EmbeddedFontStyleParser styleParser(fontList);
950 
951     // reading content stream
952     {
953         CRLog::debug("Parsing opf");
954         ldomDocument * doc = LVParseXMLStream( content_stream );
955         if ( !doc )
956             return false;
957 
958 //        // for debug
959 //        {
960 //            LVStreamRef out = LVOpenFileStream("/tmp/content.xml", LVOM_WRITE);
961 //            doc->saveToStream(out, NULL, true);
962 //        }
963 
964         ldomNode * package = doc->nodeFromXPath(lString32("package"));
965         if ( package ) {
966             epubVersion = package->getAttributeValue("version");
967             if ( !epubVersion.empty() && epubVersion[0] >= '3' )
968                 isEpub3 = true;
969         }
970 
971         CRPropRef m_doc_props = m_doc->getProps();
972         // lString32 authors = doc->textFromXPath( cs32("package/metadata/creator"));
973         lString32 title = doc->textFromXPath( cs32("package/metadata/title"));
974         lString32 language = doc->textFromXPath( cs32("package/metadata/language"));
975         lString32 description = doc->textFromXPath( cs32("package/metadata/description"));
976         pageMapSource = doc->textFromXPath( cs32("package/metadata/source"));
977         // m_doc_props->setString(DOC_PROP_AUTHORS, authors);
978         m_doc_props->setString(DOC_PROP_TITLE, title);
979         m_doc_props->setString(DOC_PROP_LANGUAGE, language);
980         m_doc_props->setString(DOC_PROP_DESCRIPTION, description);
981         m_doc_props->setHex(DOC_PROP_FILE_CRC32, stream->getcrc32());
982 
983         // Return possibly multiple <dc:creator> (authors) and <dc:subject> (keywords)
984         // as a single doc_props string with values separated by \n.
985         // (these \n can be replaced on the lua side for the most appropriate display)
986         bool authors_set = false;
987         lString32 authors;
988         for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
989             ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/creator[") << fmt::decimal(i) << "]");
990             if (!item)
991                 break;
992             lString32 author = item->getText().trim();
993             if (authors_set) {
994                 authors << "\n" << author;
995             }
996             else {
997                 authors << author;
998                 authors_set = true;
999             }
1000         }
1001         m_doc_props->setString(DOC_PROP_AUTHORS, authors);
1002 
1003         // There may be multiple <dc:subject> tags, which are usually used for keywords, categories
1004         bool subjects_set = false;
1005         lString32 subjects;
1006         for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1007             ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/subject[") << fmt::decimal(i) << "]");
1008             if (!item)
1009                 break;
1010             lString32 subject = item->getText().trim();
1011             if (subjects_set) {
1012                 subjects << "\n" << subject;
1013             }
1014             else {
1015                 subjects << subject;
1016                 subjects_set = true;
1017             }
1018         }
1019         m_doc_props->setString(DOC_PROP_KEYWORDS, subjects);
1020 
1021         for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1022             ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/identifier[") << fmt::decimal(i) << "]");
1023             if (!item)
1024                 break;
1025             lString32 key = item->getText().trim();
1026             if (decryptor->setManglingKey(key)) {
1027                 CRLog::debug("Using font mangling key %s", LCSTR(key));
1028                 break;
1029             }
1030         }
1031 
1032 #if BUILD_LITE!=1
1033         // If there is a cache file, it contains the fully built DOM document
1034         // made from the multiple html fragments in the epub, and also
1035         // m_doc_props which has been serialized.
1036         // No need to do all the below work, except if we are only
1037         // requesting metadata (parsing some bits from the EPUB is still
1038         // less expensive than loading the full cache file).
1039         // We had to wait till here to do that, to not miss font mangling
1040         // key if any.
1041         if (!metadataOnly) {
1042             CRLog::debug("Trying loading from cache");
1043             if ( m_doc->openFromCache(formatCallback, progressCallback) ) {
1044                 CRLog::debug("Loaded from cache");
1045                 if ( progressCallback ) {
1046                     progressCallback->OnLoadFileEnd( );
1047                 }
1048                 delete doc;
1049                 return true;
1050             }
1051             CRLog::debug("Not loaded from cache, parsing epub content");
1052         }
1053 #endif
1054 
1055         CRLog::info("Authors: %s Title: %s", LCSTR(authors), LCSTR(title));
1056         bool hasSeriesMeta = false;
1057         bool hasSeriesIdMeta = false;
1058         for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1059             // If we've already got all of 'em, we're done
1060             if (hasSeriesIdMeta && !coverId.empty()) {
1061                 break;
1062             }
1063 
1064             ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
1065             if ( !item )
1066                 break;
1067 
1068             lString32 name = item->getAttributeValue("name");
1069             // Might come before or after the series stuff
1070             // (e.g., while you might think it'd come early, Calibre appends it during the Send To Device process).
1071             // Fun fact: this isn't part of *either* version of the ePub specs.
1072             // It's simply an agreed-upon convention, given how utterly terrible the actual specs are.
1073             if (coverId.empty() && name == "cover") {
1074                 lString32 content = item->getAttributeValue("content");
1075                 coverId = content;
1076                 continue;
1077             }
1078             // Has to come before calibre:series_index
1079             if (!hasSeriesMeta && name == "calibre:series") {
1080                 lString32 content = item->getAttributeValue("content");
1081                 PreProcessXmlString(content, 0);
1082                 m_doc_props->setString(DOC_PROP_SERIES_NAME, content);
1083                 hasSeriesMeta = true;
1084                 continue;
1085             }
1086             // Has to come after calibre:series
1087             if (hasSeriesMeta && name == "calibre:series_index") {
1088                 lString32 content = item->getAttributeValue("content");
1089                 PreProcessXmlString(content, 0);
1090                 m_doc_props->setString(DOC_PROP_SERIES_NUMBER, content);
1091                 hasSeriesIdMeta = true;
1092                 continue;
1093             }
1094         }
1095 
1096         // Fallback to the ePub 3 spec for cover-image, c.f. https://www.w3.org/publishing/epub3/epub-packages.html#sec-cover-image
1097         if (isEpub3 && coverId.empty()) {
1098             for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1099                 ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
1100                 if ( !item )
1101                     break;
1102 
1103                 // NOTE: Yes, plural, not a typo... -_-"
1104                 lString32 props = item->getAttributeValue("properties");
1105                 if (!props.empty() && props == "cover-image") {
1106                     lString32 id = item->getAttributeValue("id");
1107                     coverId = id;
1108                     // Can only be one (or none), we're done!
1109                     break;
1110                 }
1111             }
1112         }
1113 
1114         // Fallback to ePub 3 series metadata, c.f., https://www.w3.org/publishing/epub3/epub-packages.html#sec-belongs-to-collection
1115         // Because, yes, they're less standard than Calibre's ;D. Gotta love the ePub specs...
1116         // NOTE: This doesn't include the shittier variant where apparently a collection-type refines a dc:title's id,
1117         //       or something? Not in the specs, so, don't care.
1118         //       c.f., the first branch in https://github.com/koreader/crengine/issues/267#issuecomment-557507150
1119         //       The only similar thing buried deep in the original 3.0 specs is incredibly convoluted:
1120         //       http://idpf.org/epub/30/spec/epub30-publications.html#sec-opf-dctitle
1121         //       That thankfully seems to have been relegated to the past, despite title-type still supporting a collection type:
1122         //       https://www.w3.org/publishing/epub32/epub-packages.html#sec-title-type
1123         if (isEpub3 && !hasSeriesMeta) {
1124             lString32 seriesId;
1125             for ( size_t i=1; i<=EPUB_META_MAX_ITER; i++ ) {
1126                 ldomNode * item = doc->nodeFromXPath(lString32("package/metadata/meta[") << fmt::decimal(i) << "]");
1127                 if ( !item )
1128                     break;
1129 
1130                 lString32 property = item->getAttributeValue("property");
1131 
1132                 // If we don't have a collection yet, try to find one
1133                 // NOTE: The specs say that collections *MAY* be nested (i.e., a belongs-to-collection node may refine another one).
1134                 //       For simplicity's sake, we only honor the first belongs-to-collection node here.
1135                 //       If I had actual test data, I could have instead opted to specifically match on the "parent" collection,
1136                 //       or the most deeply nested one, depending on what made the most sense, but I don't, so, KISS ;).
1137                 if (!hasSeriesMeta) {
1138                     if (property == "belongs-to-collection") {
1139                         lString32 content = item->getText().trim();
1140                         PreProcessXmlString(content, 0);
1141                         m_doc_props->setString(DOC_PROP_SERIES_NAME, content);
1142                         hasSeriesMeta = true;
1143                         seriesId = item->getAttributeValue("id");
1144                         // Next!
1145                         continue;
1146                     }
1147                 }
1148 
1149                 // If we've got a collection, check if other properties refine it...
1150                 if (hasSeriesMeta) {
1151                     // NOTE: We don't really handle series any differently than set, so we don't really care about this...
1152                     /*
1153                     if (property == "collection-type") {
1154                         // Only support valid types (series or set)
1155                         lString32 content = item->getText().trim();
1156                         if (content == "series" || content == "set") {
1157                             lString32 id = item->getAttributeValue("refines");
1158                             // Strip the anchor to match against seriesId
1159                             if (id.startsWith("#")) {
1160                                 id = id.substr(1, id.length() - 1);
1161                             }
1162                             if (id == seriesId) {
1163                                 // Next!
1164                                 continue;
1165                             }
1166                         }
1167                     }
1168                     */
1169                     if (property == "group-position") {
1170                         lString32 id = item->getAttributeValue("refines");
1171                         // Strip the anchor to match against seriesId
1172                         if (id.startsWith("#")) {
1173                             id = id.substr(1, id.length() - 1);
1174                         }
1175                         // If we've got a match, that's our position in the series!
1176                         if (id == seriesId) {
1177                             lString32 content = item->getText().trim();
1178                             PreProcessXmlString(content, 0);
1179                             // NOTE: May contain decimal values (much like calibre:series_index).
1180                             //       c.f., https://github.com/koreader/crengine/pull/346#discussion_r436190907
1181                             m_doc_props->setString(DOC_PROP_SERIES_NUMBER, content);
1182                             // And we're done :)
1183                             break;
1184                         }
1185                     }
1186                 }
1187             }
1188         }
1189 
1190         if (metadataOnly && coverId.empty()) {
1191             // no cover to look for, no need for more work
1192             delete doc;
1193             return true;
1194         }
1195 
1196         if ( progressCallback )
1197             progressCallback->OnLoadFileProgress(2);
1198 
1199         // items
1200         CRLog::debug("opf: reading items");
1201         for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1202             ldomNode * item = doc->nodeFromXPath(lString32("package/manifest/item[") << fmt::decimal(i) << "]");
1203             if ( !item )
1204                 break;
1205             lString32 href = item->getAttributeValue("href");
1206             lString32 mediaType = item->getAttributeValue("media-type");
1207             lString32 id = item->getAttributeValue("id");
1208             if ( !href.empty() && !id.empty() ) {
1209                 href = DecodeHTMLUrlString(href);
1210                 if ( id==coverId ) {
1211                     // coverpage file
1212                     lString32 coverFileName = LVCombinePaths(codeBase, href);
1213                     CRLog::info("EPUB coverpage file: %s", LCSTR(coverFileName));
1214                     LVStreamRef stream = m_arc->OpenStream(coverFileName.c_str(), LVOM_READ);
1215                     if ( !stream.isNull() ) {
1216                         LVImageSourceRef img = LVCreateStreamImageSource(stream);
1217                         if ( !img.isNull() ) {
1218                             CRLog::info("EPUB coverpage image is correct: %d x %d", img->GetWidth(), img->GetHeight() );
1219                             m_doc_props->setString(DOC_PROP_COVER_FILE, coverFileName);
1220                         }
1221                     }
1222                     if (metadataOnly) {
1223                         // coverId found, no need for more work
1224                         delete doc;
1225                         return true;
1226                     }
1227                 }
1228                 EpubItem * epubItem = new EpubItem;
1229                 epubItem->href = href;
1230                 epubItem->id = id;
1231                 epubItem->mediaType = mediaType;
1232                 epubItems.add( epubItem );
1233 
1234                 if ( isEpub3 && navHref.empty() ) {
1235                     lString32 properties = item->getAttributeValue("properties");
1236                     // We met properties="nav scripted"...
1237                     if ( properties == U"nav" || properties.startsWith(U"nav ")
1238                             || properties.endsWith(U" nav") || properties.pos(U" nav ") >= 0 ) {
1239                         navHref = href;
1240                     }
1241                 }
1242 
1243 //                // register embedded document fonts
1244 //                if (mediaType == U"application/vnd.ms-opentype"
1245 //                        || mediaType == U"application/x-font-otf"
1246 //                        || mediaType == U"application/x-font-ttf") { // TODO: more media types?
1247 //                    // TODO:
1248 //                    fontList.add(codeBase + href);
1249 //                }
1250             }
1251             if (mediaType == "text/css") {
1252                 lString32 name = LVCombinePaths(codeBase, href);
1253                 LVStreamRef cssStream = m_arc->OpenStream(name.c_str(), LVOM_READ);
1254                 if (!cssStream.isNull()) {
1255                     lString8 cssFile = UnicodeToUtf8(LVReadTextFile(cssStream));
1256                     lString32 base = name;
1257                     LVExtractLastPathElement(base);
1258                     //CRLog::trace("style: %s", cssFile.c_str());
1259                     styleParser.parse(base, cssFile);
1260                 }
1261                 // Huge CSS files may take some time being parsed, so update progress
1262                 // after each one to get a chance of it being displayed at this point.
1263                 if ( progressCallback )
1264                     progressCallback->OnLoadFileProgress(3);
1265             }
1266         }
1267         CRLog::debug("opf: reading items done.");
1268 
1269         if ( progressCallback )
1270             progressCallback->OnLoadFileProgress(4);
1271 
1272         // spine == itemrefs
1273         if ( epubItems.length()>0 ) {
1274             CRLog::debug("opf: reading spine");
1275             ldomNode * spine = doc->nodeFromXPath( cs32("package/spine") );
1276             if ( spine ) {
1277 
1278                 // <spine toc="ncx" page-map="page-map">
1279                 EpubItem * ncx = epubItems.findById( spine->getAttributeValue("toc") ); //TODO
1280                 if ( ncx!=NULL )
1281                     ncxHref = LVCombinePaths(codeBase, ncx->href);
1282                 EpubItem * page_map = epubItems.findById( spine->getAttributeValue("page-map") );
1283                 if ( page_map!=NULL )
1284                     pageMapHref = LVCombinePaths(codeBase, page_map->href);
1285 
1286                 for ( size_t i=1; i<=EPUB_ITEM_MAX_ITER; i++ ) {
1287                     ldomNode * item = doc->nodeFromXPath(lString32("package/spine/itemref[") << fmt::decimal(i) << "]");
1288                     if ( !item )
1289                         break;
1290                     EpubItem * epubItem = epubItems.findById( item->getAttributeValue("idref") );
1291                     epubItem->nonlinear = lString32(item->getAttributeValue("linear")).lowercase() == U"no";
1292                     if ( epubItem ) {
1293                         // TODO: add to document
1294                         spineItems.add( epubItem );
1295                     }
1296                 }
1297             }
1298             CRLog::debug("opf: reading spine done");
1299         }
1300         delete doc;
1301         CRLog::debug("opf: closed");
1302     }
1303 
1304     if ( spineItems.length()==0 )
1305         return false;
1306 
1307     if (metadataOnly)
1308         return true; // no need for more work
1309 
1310     if ( progressCallback )
1311         progressCallback->OnLoadFileProgress(5);
1312 
1313     lUInt32 saveFlags = m_doc->getDocFlags();
1314     m_doc->setDocFlags( saveFlags );
1315     m_doc->setContainer( m_arc );
1316 
1317     ldomDocumentWriter writer(m_doc);
1318 #if 0
1319     m_doc->setNodeTypes( fb2_elem_table );
1320     m_doc->setAttributeTypes( fb2_attr_table );
1321     m_doc->setNameSpaceTypes( fb2_ns_table );
1322 #endif
1323     //m_doc->setCodeBase( codeBase );
1324 
1325     int fontList_nb_before_head_parsing = fontList.length();
1326     if (!fontList.empty()) {
1327         // set document font list, and register fonts
1328         m_doc->getEmbeddedFontList().set(fontList);
1329         m_doc->registerEmbeddedFonts();
1330     }
1331 
1332     ldomDocumentFragmentWriter appender(&writer, cs32("body"), cs32("DocFragment"), lString32::empty_str );
1333     writer.OnStart(NULL);
1334     writer.OnTagOpenNoAttr(U"", U"body");
1335     int fragmentCount = 0;
1336     size_t spineItemsNb = spineItems.length();
1337     for ( size_t i=0; i<spineItemsNb; i++ ) {
1338         if (spineItems[i]->mediaType == "application/xhtml+xml") {
1339             lString32 name = LVCombinePaths(codeBase, spineItems[i]->href);
1340             lString32 subst = cs32("_doc_fragment_") + fmt::decimal(i);
1341             appender.addPathSubstitution( name, subst );
1342             //CRLog::trace("subst: %s => %s", LCSTR(name), LCSTR(subst));
1343         }
1344     }
1345     int lastProgressPercent = 5;
1346     for ( size_t i=0; i<spineItemsNb; i++ ) {
1347         if ( progressCallback ) {
1348             int percent = 5 + 95 * i / spineItemsNb;
1349             if ( percent > lastProgressPercent ) {
1350                 progressCallback->OnLoadFileProgress(percent);
1351                 lastProgressPercent = percent;
1352             }
1353         }
1354         if (spineItems[i]->mediaType == "application/xhtml+xml") {
1355             lString32 name = LVCombinePaths(codeBase, spineItems[i]->href);
1356             {
1357                 CRLog::debug("Checking fragment: %s", LCSTR(name));
1358                 LVStreamRef stream = m_arc->OpenStream(name.c_str(), LVOM_READ);
1359                 if ( !stream.isNull() ) {
1360                     appender.setCodeBase( name );
1361                     lString32 base = name;
1362                     LVExtractLastPathElement(base);
1363                     //CRLog::trace("base: %s", LCSTR(base));
1364                     //LVXMLParser
1365                     LVHTMLParser parser(stream, &appender);
1366                     appender.setNonLinearFlag(spineItems[i]->nonlinear);
1367                     if ( parser.CheckFormat() && parser.Parse() ) {
1368                         // valid
1369                         fragmentCount++;
1370                         lString8 headCss = appender.getHeadStyleText();
1371                         //CRLog::trace("style: %s", headCss.c_str());
1372                         styleParser.parse(base, headCss);
1373                     } else {
1374                         CRLog::error("Document type is not XML/XHTML for fragment %s", LCSTR(name));
1375                     }
1376                 }
1377             }
1378         }
1379     }
1380 
1381     // Clear any toc items possibly added while parsing the HTML
1382     m_doc->getToc()->clear();
1383     bool has_toc = false;
1384     bool has_pagemap = false;
1385 
1386     // EPUB3 documents may contain both a toc.ncx and a nav xhtml toc.
1387     // We would have preferred to read first a toc.ncx if present, as it
1388     // is more structured than nav toc (all items have a href), but it
1389     // seems Sigil includes a toc.ncx for EPUB3, but does not keep it
1390     // up-to-date, while it does for the nav toc.
1391     if ( isEpub3 && !navHref.empty() ) {
1392         // Parse toc nav if epub3
1393         // http://idpf.org/epub/30/spec/epub30-contentdocs.html#sec-xhtml-nav-def
1394         navHref = LVCombinePaths(codeBase, navHref);
1395         LVStreamRef stream = m_arc->OpenStream(navHref.c_str(), LVOM_READ);
1396         lString32 codeBase = LVExtractPath( navHref );
1397         if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1398             codeBase.append(1, U'/');
1399         appender.setCodeBase(codeBase);
1400         if ( !stream.isNull() ) {
1401             ldomDocument * navDoc = LVParseXMLStream( stream );
1402             if ( navDoc!=NULL ) {
1403                 // Find <nav epub:type="toc">
1404                 lUInt16 nav_id = navDoc->getElementNameIndex(U"nav");
1405                 ldomNode * navDocRoot = navDoc->getRootNode();
1406                 ldomNode * n = navDocRoot;
1407                 // Kobo falls back to other <nav type=> when no <nav type=toc> is found,
1408                 // let's do the same.
1409                 ldomNode * n_toc = NULL;
1410                 ldomNode * n_landmarks = NULL;
1411                 ldomNode * n_page_list = NULL;
1412                 if (n->isElement() && n->getChildCount() > 0) {
1413                     int nextChildIndex = 0;
1414                     n = n->getChildNode(nextChildIndex);
1415                     while (true) {
                        // Check a node only the first time we meet it (nextChildIndex == 0),
                        // not when we come back to it from a child to process its next sibling
1418                         if (nextChildIndex == 0) {
1419                             if ( n->isElement() && n->getNodeId() == nav_id ) {
1420                                 lString32 type = n->getAttributeValue("type");
1421                                 if ( type == U"toc") {
1422                                     n_toc = n;
1423                                 }
1424                                 else if ( type == U"landmarks") {
1425                                     n_landmarks = n;
1426                                 }
1427                                 else if ( type == U"page-list") {
1428                                     n_page_list = n;
1429                                 }
1430                             }
1431                         }
1432                         // Process next child
1433                         if (n->isElement() && nextChildIndex < n->getChildCount()) {
1434                             n = n->getChildNode(nextChildIndex);
1435                             nextChildIndex = 0;
1436                             continue;
1437                         }
                        // No more children: go back to the parent and have it process our next sibling
1439                         nextChildIndex = n->getNodeIndex() + 1;
1440                         n = n->getParentNode();
1441                         if (!n) // back to root node
1442                             break;
1443                         if (n == navDocRoot && nextChildIndex >= n->getChildCount())
1444                             // back to this node, and done with its children
1445                             break;
1446                     }
1447                 }
1448                 if ( !n_toc ) {
1449                     if ( n_landmarks ) {
1450                         n_toc = n_landmarks;
1451                     }
1452                     else if ( n_page_list ) {
1453                         n_toc = n_page_list;
1454                     }
1455                 }
1456                 if ( n_toc ) {
1457                     // "Each nav element may contain an optional heading indicating the title
1458                     // of the navigation list. The heading must be one of H1...H6."
                    // We can't do much with this heading (it would not resolve to anything):
                    // we could add it as a top container item for the others, but that would
                    // be useless (and annoying), so let's just ignore it.
1462                     // Get its first and single <OL> child
1463                     ldomNode * ol_root = n_toc->findChildElement( LXML_NS_ANY, navDoc->getElementNameIndex(U"ol"), -1 );
1464                     if ( ol_root )
1465                         ReadEpubNavToc( m_doc, ol_root, m_doc->getToc(), appender );
1466                 }
1467                 if ( n_page_list ) {
1468                     ldomNode * ol_root = n_page_list->findChildElement( LXML_NS_ANY, navDoc->getElementNameIndex(U"ol"), -1 );
1469                     if ( ol_root )
1470                         ReadEpubNavPageMap( m_doc, ol_root, m_doc->getPageMap(), appender );
1471                 }
1472                 delete navDoc;
1473             }
1474         }
1475     }
1476 
1477     has_toc = m_doc->getToc()->getChildCount() > 0;
1478     has_pagemap = m_doc->getPageMap()->getChildCount() > 0;
1479 
1480     // For EPUB2 (or EPUB3 where no nav toc was found): read ncx toc
1481     // We may also find in the ncx a <pageList> list
1482     if ( ( !has_toc || !has_pagemap ) && !ncxHref.empty() ) {
1483         LVStreamRef stream = m_arc->OpenStream(ncxHref.c_str(), LVOM_READ);
1484         lString32 codeBase = LVExtractPath( ncxHref );
1485         if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1486             codeBase.append(1, U'/');
1487         appender.setCodeBase(codeBase);
1488         if ( !stream.isNull() ) {
1489             ldomDocument * ncxdoc = LVParseXMLStream( stream );
1490             if ( ncxdoc!=NULL ) {
1491                 if ( !has_toc ) {
1492                     ldomNode * navMap = ncxdoc->nodeFromXPath( cs32("ncx/navMap"));
1493                     if ( navMap!=NULL )
1494                         ReadEpubNcxToc( m_doc, navMap, m_doc->getToc(), appender );
1495                 }
1496                 // http://blog.epubbooks.com/346/marking-up-page-numbers-in-the-epub-ncx/
1497                 if ( !has_pagemap ) {
1498                     ldomNode * pageList = ncxdoc->nodeFromXPath( cs32("ncx/pageList"));
1499                     if ( pageList!=NULL )
1500                         ReadEpubNcxPageList( m_doc, pageList, m_doc->getPageMap(), appender );
1501                 }
1502                 delete ncxdoc;
1503             }
1504         }
1505     }
1506 
1507     has_toc = m_doc->getToc()->getChildCount() > 0;
1508     has_pagemap = m_doc->getPageMap()->getChildCount() > 0;
1509 
1510     // If still no TOC, fallback to using the spine, as Kobo does.
1511     if ( !has_toc ) {
1512         LVTocItem * baseToc = m_doc->getToc();
1513         for ( size_t i=0; i<spineItemsNb; i++ ) {
1514             if (spineItems[i]->mediaType == "application/xhtml+xml") {
1515                 lString32 title = spineItems[i]->id; // nothing much else to use
1516                 lString32 href = appender.convertHref(spineItems[i]->id);
1517                 if ( href.empty() || href[0]!='#' )
1518                     continue;
1519                 ldomNode * target = m_doc->getNodeById(m_doc->getAttrValueIndex(href.substr(1).c_str()));
1520                 if ( !target )
1521                     continue;
1522                 ldomXPointer ptr(target, 0);
1523                 baseToc->addChild(title, ptr, lString32::empty_str);
1524             }
1525         }
1526     }
1527 
1528     // If no pagemap, parse Adobe page-map if there is one
1529     // https://wiki.mobileread.com/wiki/Adobe_Digital_Editions#Page-map
1530     if ( !has_pagemap && !pageMapHref.empty() ) {
1531         LVStreamRef stream = m_arc->OpenStream(pageMapHref.c_str(), LVOM_READ);
1532         lString32 codeBase = LVExtractPath( pageMapHref );
1533         if ( codeBase.length()>0 && codeBase.lastChar()!='/' )
1534             codeBase.append(1, U'/');
1535         appender.setCodeBase(codeBase);
1536         if ( !stream.isNull() ) {
1537             ldomDocument * pagemapdoc = LVParseXMLStream( stream );
1538             if ( pagemapdoc!=NULL ) {
1539                 if ( !has_pagemap ) {
1540                     ldomNode * pageMap = pagemapdoc->nodeFromXPath( cs32("page-map"));
1541                     if ( pageMap!=NULL )
1542                         ReadEpubAdobePageMap( m_doc, pageMap, m_doc->getPageMap(), appender );
1543                 }
1544                 delete pagemapdoc;
1545             }
1546         }
1547     }
1548 
1549     if ( m_doc->getPageMap()->getChildCount() > 0 && !pageMapSource.empty() )
1550         m_doc->getPageMap()->setSource(pageMapSource);
1551 
1552     writer.OnTagClose(U"", U"body");
1553     writer.OnStop();
1554     CRLog::debug("EPUB: %d documents merged", fragmentCount);
1555 
1556     if ( fontList.length() != fontList_nb_before_head_parsing ) {
        // New fonts were found while parsing the <head><style> of some DocFragments
1558         m_doc->unregisterEmbeddedFonts();
1559         // set document font list, and register fonts
1560         m_doc->getEmbeddedFontList().set(fontList);
1561         m_doc->registerEmbeddedFonts();
1562         printf("CRE: document loaded, but styles re-init needed (cause: embedded fonts)\n");
1563         m_doc->forceReinitStyles();
1564         // todo: we could avoid forceReinitStyles() when embedded fonts are disabled
1565         // (but being here is quite rare - and having embedded font disabled even more)
1566     }
1567 
1568     if ( fragmentCount==0 )
1569         return false;
1570 
1571 #if 0
1572     // set stylesheet
1573     //m_doc->getStyleSheet()->clear();
1574     m_doc->setStyleSheet( NULL, true );
1575     //m_doc->getStyleSheet()->parse(m_stylesheet.c_str());
1576     if ( !css.empty() && m_doc->getDocFlag(DOC_FLAG_ENABLE_INTERNAL_STYLES) ) {
1577 
1578         m_doc->setStyleSheet( "p.p { text-align: justify }\n"
1579             "svg { text-align: center }\n"
1580             "i { display: inline; font-style: italic }\n"
1581             "b { display: inline; font-weight: bold }\n"
1582             "abbr { display: inline }\n"
1583             "acronym { display: inline }\n"
1584             "address { display: inline }\n"
1585             "p.title-p { hyphenate: none }\n"
1586 //abbr, acronym, address, blockquote, br, cite, code, dfn, div, em, h1, h2, h3, h4, h5, h6, kbd, p, pre, q, samp, span, strong, var
1587         , false);
1588         m_doc->setStyleSheet( UnicodeToUtf8(css).c_str(), false );
1589         //m_doc->getStyleSheet()->parse(UnicodeToUtf8(css).c_str());
1590     } else {
1591         //m_doc->getStyleSheet()->parse(m_stylesheet.c_str());
1592         //m_doc->setStyleSheet( m_stylesheet.c_str(), false );
1593     }
1594 #endif
1595 #if 0
1596     LVStreamRef out = LVOpenFileStream( U"c:\\doc.xml" , LVOM_WRITE );
1597     if ( !out.isNull() )
1598         m_doc->saveToStream( out, "utf-8" );
1599 #endif
1600 
1601     // DONE!
1602     if ( progressCallback ) {
1603         progressCallback->OnLoadFileEnd( );
1604         m_doc->compact();
1605         m_doc->dumpStatistics();
1606     }
1607 
1608     // save compound XML document, for testing:
1609     //m_doc->saveToStream(LVOpenFileStream("/tmp/epub_dump.xml", LVOM_WRITE), NULL, true);
1610 
1611     return true;
1612 
1613 }
1614