1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /*
3  * This file is part of the libetonyek project.
4  *
5  * This Source Code Form is subject to the terms of the Mozilla Public
6  * License, v. 2.0. If a copy of the MPL was not distributed with this
7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8  */
9 
10 #include <libetonyek/libetonyek.h>
11 
12 #include <cassert>
13 #include <cstring>
14 #include <memory>
15 
16 #include <boost/algorithm/string/predicate.hpp>
17 #include <boost/optional.hpp>
18 
19 #include <libxml/xmlreader.h>
20 
21 #include "libetonyek_utils.h"
22 #include "libetonyek_xml.h"
23 #include "IWAMessage.h"
24 #include "IWASnappyStream.h"
25 #include "IWORKPresentationRedirector.h"
26 #include "IWORKSpreadsheetRedirector.h"
27 #include "IWORKSubDirStream.h"
28 #include "IWORKTextRedirector.h"
29 #include "IWORKTokenizer.h"
30 #include "IWORKZlibStream.h"
31 #include "KEY1Dictionary.h"
32 #include "KEY1Parser.h"
33 #include "KEY1Token.h"
34 #include "KEY2Dictionary.h"
35 #include "KEY2Parser.h"
36 #include "KEY2Token.h"
37 #include "KEY6Parser.h"
38 #include "KEYCollector.h"
39 #include "NUMCollector.h"
40 #include "NUM1Dictionary.h"
41 #include "NUM1Parser.h"
42 #include "NUM1Token.h"
43 #include "NUM3Parser.h"
44 #include "PAGCollector.h"
45 #include "PAG1Dictionary.h"
46 #include "PAG1Parser.h"
47 #include "PAG1Token.h"
48 #include "PAG5Parser.h"
49 
50 
51 using std::shared_ptr;
52 using std::string;
53 
54 using librevenge::RVNG_SEEK_SET;
55 
56 namespace libetonyek
57 {
58 
59 namespace
60 {
61 
/// On-disk flavour of a document, as established by the detection helpers in this file.
enum Format
{
  FORMAT_UNKNOWN = 0, // nothing recognised (yet)
  FORMAT_XML1 = 1,    // selected for presentation.apxl(.gz) / the KEY1 root element
  FORMAT_XML2 = 2,    // selected for index.apxl(.gz) and index.xml(.gz)
  FORMAT_BINARY = 3   // selected when Index/Document.iwa is present
};
69 
70 struct DetectionInfo
71 {
72   explicit DetectionInfo(EtonyekDocument::Type type = EtonyekDocument::TYPE_UNKNOWN);
73 
74   RVNGInputStreamPtr_t m_input;
75   RVNGInputStreamPtr_t m_package;
76   RVNGInputStreamPtr_t m_fragments;
77   EtonyekDocument::Confidence m_confidence;
78   EtonyekDocument::Type m_type;
79   Format m_format;
80 };
81 
DetectionInfo(const EtonyekDocument::Type type)82 DetectionInfo::DetectionInfo(const EtonyekDocument::Type type)
83   : m_input()
84   , m_package()
85   , m_fragments()
86   , m_confidence(EtonyekDocument::CONFIDENCE_NONE)
87   , m_type(type)
88   , m_format(FORMAT_UNKNOWN)
89 {
90 }
91 
probeXMLFormat(const Format format,const EtonyekDocument::Type type,const int docId,const IWORKTokenizer & tokenizer,const char * const name,const char * const ns,DetectionInfo & info)92 bool probeXMLFormat(const Format format, const EtonyekDocument::Type type, const int docId,
93                     const IWORKTokenizer &tokenizer, const char *const name, const char *const ns,
94                     DetectionInfo &info)
95 {
96   if (((info.m_format == format) || (info.m_format == FORMAT_UNKNOWN))
97       && ((info.m_type == type) || info.m_type == EtonyekDocument::TYPE_UNKNOWN))
98   {
99     if (tokenizer.getQualifiedId(name, ns) == docId)
100     {
101       info.m_format = format;
102       info.m_type = type;
103       return true;
104     }
105   }
106   return false;
107 }
108 
109 namespace
110 {
handleError(void *,const char *,xmlParserSeverities,xmlTextReaderLocatorPtr)111 void handleError(void * /*arg*/, const char * /*msg*/, xmlParserSeverities /*severity*/, xmlTextReaderLocatorPtr /*locator*/)
112 {
113 }
114 }
115 
probeXML(DetectionInfo & info)116 bool probeXML(DetectionInfo &info)
117 {
118   const auto reader = xmlReaderForStream(info.m_input);
119   if (!reader)
120     return false;
121 
122   xmlTextReaderSetErrorHandler(reader.get(), handleError, nullptr);
123 
124   int ret = 0;
125   bool checkAPXL=false;
126   do
127   {
128     ret = xmlTextReaderRead(reader.get());
129     if (ret==1 && XML_READER_TYPE_DOCUMENT_TYPE==xmlTextReaderNodeType(reader.get()))
130     {
131       auto name = xmlTextReaderConstName(reader.get());
132       if (name) checkAPXL=std::string((char const *)name)=="APXL";
133     }
134   }
135   while ((1 == ret) && (XML_READER_TYPE_ELEMENT != xmlTextReaderNodeType(reader.get())));
136 
137   if (1 != ret)
138     return false;
139 
140   const char *const name = char_cast(xmlTextReaderConstLocalName(reader.get()));
141   const char *const ns = char_cast(xmlTextReaderConstNamespaceUri(reader.get()));
142   if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_KEYNOTE, KEY2Token::NS_URI_KEY | KEY2Token::presentation,
143                      KEY2Token::getTokenizer(), name, ns, info))
144     return true;
145   if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_NUMBERS, NUM1Token::NS_URI_LS | NUM1Token::document,
146                      NUM1Token::getTokenizer(), name, ns, info))
147     return true;
148   if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_PAGES, PAG1Token::NS_URI_SL | PAG1Token::document,
149                      PAG1Token::getTokenizer(), name, ns, info))
150     return true;
151   // Keynote 1 files define the document type with <!DOCTYPE APXL>
152   if (probeXMLFormat(FORMAT_XML1, EtonyekDocument::TYPE_KEYNOTE, KEY1Token::NS_URI_KEY | KEY1Token::presentation,
153                      KEY1Token::getTokenizer(), name, (ns||!checkAPXL) ? ns : "http://developer.apple.com/schemas/APXL", info))
154     return true;
155   return false;
156 }
157 
probeBinary(DetectionInfo & info)158 bool probeBinary(DetectionInfo &info)
159 {
160   const uint64_t headerLen = readUVar(info.m_input);
161   if (headerLen < 8)
162     return false;
163 
164   EtonyekDocument::Type detected = EtonyekDocument::TYPE_UNKNOWN;
165 
166   const auto pos = uint64_t(info.m_input->tell());
167   const IWAMessage header(info.m_input, (unsigned long) headerLen);
168 
169   if (header.uint32(1) && header.message(2) && header.message(2).uint32(1) && (header.uint32(1).get() == 1))
170   {
171     switch (header.message(2).uint32(1).get())
172     {
173     case 1 :
174       if (header.message(2).uint32(3))
175       {
176         uint32_t dataLen = 0;
177         for (auto const &infoT : header.message(2))
178         {
179           if (infoT.uint32(3)) dataLen += infoT.uint32(3).get();
180         }
181         const IWAMessage data(info.m_input, long(pos + headerLen), long(pos + headerLen + dataLen));
182         // keynote: presentation ref in 2
183         // number: sheet ref in 1
184         if (!data.message(1))
185           detected = EtonyekDocument::TYPE_KEYNOTE;
186         else if (!data.message(2))
187           detected = EtonyekDocument::TYPE_NUMBERS;
188         else
189         {
190           unsigned potentialRef[2];
191           for (unsigned test=1; test<=2; ++test)
192           {
193             auto ref=data.message(test).uint32(1).optional();
194             if (!ref)
195             {
196               detected = test==1 ? EtonyekDocument::TYPE_KEYNOTE : EtonyekDocument::TYPE_NUMBERS;
197               break;
198             }
199             potentialRef[test-1]=get(ref);
200           }
201           if (detected != EtonyekDocument::TYPE_UNKNOWN)
202             break;
203           // undecise, try to find the first ref
204           IWAObjectIndex objIndex(info.m_fragments, info.m_package);
205           objIndex.parse();
206           auto type = objIndex.getObjectType(potentialRef[0]);
207           detected = type && get(type)==2 ?
208                      EtonyekDocument::TYPE_NUMBERS : EtonyekDocument::TYPE_KEYNOTE;
209         }
210       }
211       break;
212     case 10000 :
213       detected = EtonyekDocument::TYPE_PAGES;
214       break;
215     default:
216       break;
217     }
218   }
219 
220   if ((info.m_type == EtonyekDocument::TYPE_UNKNOWN) || (info.m_type == detected))
221   {
222     info.m_type = detected;
223     return true;
224   }
225   return false;
226 }
227 
/** Look up the substream @c name of @c input and wrap the result in a
  * RVNGInputStreamPtr_t, which takes over the returned pointer.
  */
RVNGInputStreamPtr_t getSubStream(const RVNGInputStreamPtr_t &input, const char *const name)
{
  RVNGInputStreamPtr_t stream(input->getSubStreamByName(name));
  return stream;
}
232 
getUncompressedSubStream(const RVNGInputStreamPtr_t & input,const char * const name,bool snappy=false)233 RVNGInputStreamPtr_t getUncompressedSubStream(const RVNGInputStreamPtr_t &input, const char *const name, bool snappy = false) try
234 {
235   const RVNGInputStreamPtr_t compressed(input->getSubStreamByName(name));
236   if (bool(compressed))
237   {
238     if (snappy)
239       return RVNGInputStreamPtr_t(new IWASnappyStream(compressed));
240     return RVNGInputStreamPtr_t(new IWORKZlibStream(compressed));
241   }
242   return RVNGInputStreamPtr_t();
243 }
244 catch (...)
245 {
246   return RVNGInputStreamPtr_t();
247 }
248 
detectBinary(RVNGInputStreamPtr_t input,DetectionInfo & info)249 bool detectBinary(RVNGInputStreamPtr_t input, DetectionInfo &info)
250 {
251   assert(input->isStructured());
252 
253   if (input->existsSubStream("Metadata/DocumentIdentifier"))
254     info.m_package = input;
255 
256   if (input->existsSubStream("Index.zip"))
257   {
258     RVNGInputStreamPtr_t zipInput = getSubStream(input, "Index.zip");
259     if (bool(zipInput))
260       input = zipInput;
261   }
262 
263   const bool hasDocument = input->existsSubStream("Index/Document.iwa");
264 
265   if (hasDocument)
266   {
267     info.m_format = FORMAT_BINARY;
268     info.m_fragments = input;
269     info.m_input = getUncompressedSubStream(input, "Index/Document.iwa", true);
270   }
271 
272   return hasDocument;
273 }
274 
/** Present the single top-level directory of a structured stream as a stream of its own.
  *
  * @param input structured stream to inspect
  * @return an IWORKSubDirStream for the first path component shared by all
  *         named substreams, or an empty pointer if the substreams do not
  *         share one (or no usable name was found)
  */
RVNGInputStreamPtr_t queryTopDirStream(const RVNGInputStreamPtr_t &input)
{
  assert(input->isStructured());

  string commonDir;

  for (unsigned idx = 0; idx < input->subStreamCount(); ++idx)
  {
    const char *const entry = input->subStreamName(idx);
    if (!entry)
      continue;

    if (commonDir.empty())
    {
      // take the first path component of this entry as the candidate
      const char *const slash = std::strchr(entry, '/');
      if (slash)
        commonDir.assign(entry, std::size_t(slash - entry));
      else
        commonDir = entry;
      continue;
    }

    // every further entry must be the candidate itself or live below it
    if (!boost::starts_with(entry, commonDir))
      return RVNGInputStreamPtr_t();
    const char next = entry[commonDir.size()];
    if (next != '/' && next != '\0')
      return RVNGInputStreamPtr_t();
  }

  if (commonDir.empty())
    return RVNGInputStreamPtr_t();
  return RVNGInputStreamPtr_t(new IWORKSubDirStream(input, commonDir));
}
314 
detect(const RVNGInputStreamPtr_t & input,DetectionInfo & info)315 bool detect(const RVNGInputStreamPtr_t &input, DetectionInfo &info)
316 {
317   if (input->isStructured())
318   {
319     if ((info.m_format == FORMAT_BINARY) || (info.m_format == FORMAT_UNKNOWN))
320     {
321       if (!detectBinary(input, info))
322       {
323         RVNGInputStreamPtr_t dir = queryTopDirStream(input);
324         if (dir)
325           detectBinary(dir, info);
326       }
327     }
328 
329     if ((info.m_format == FORMAT_XML2) || (info.m_format == FORMAT_UNKNOWN))
330     {
331       info.m_package = input;
332 
333       if ((info.m_type == EtonyekDocument::TYPE_KEYNOTE) || (info.m_type == EtonyekDocument::TYPE_UNKNOWN))
334       {
335         if (input->existsSubStream("index.apxl"))
336         {
337           info.m_format = FORMAT_XML2;
338           info.m_type = EtonyekDocument::TYPE_KEYNOTE;
339           info.m_input = getSubStream(input, "index.apxl");
340         }
341         else if (input->existsSubStream("index.apxl.gz"))
342         {
343           info.m_format = FORMAT_XML2;
344           info.m_type = EtonyekDocument::TYPE_KEYNOTE;
345           info.m_input = getUncompressedSubStream(input, "index.apxl.gz");
346         }
347       }
348 
349       if ((info.m_type == EtonyekDocument::TYPE_NUMBERS) || (info.m_type == EtonyekDocument::TYPE_PAGES) || (info.m_type == EtonyekDocument::TYPE_UNKNOWN))
350       {
351         if (input->existsSubStream("index.xml"))
352         {
353           info.m_format = FORMAT_XML2;
354           info.m_input = getSubStream(input, "index.xml");
355         }
356         else if (input->existsSubStream("index.xml.gz"))
357         {
358           info.m_format = FORMAT_XML2;
359           info.m_input = getUncompressedSubStream(input, "index.xml.gz");
360         }
361       }
362     }
363 
364     if (info.m_format == FORMAT_XML1 || info.m_format == FORMAT_UNKNOWN)
365     {
366       info.m_package = input;
367 
368       if (input->existsSubStream("presentation.apxl"))
369       {
370         info.m_type = EtonyekDocument::TYPE_KEYNOTE;
371         info.m_format = FORMAT_XML1;
372         info.m_input = getSubStream(input, "presentation.apxl");
373       }
374       else if (input->existsSubStream("presentation.apxl.gz"))
375       {
376         info.m_type = EtonyekDocument::TYPE_KEYNOTE;
377         info.m_format = FORMAT_XML1;
378         info.m_input = getUncompressedSubStream(input, "presentation.apxl.gz");
379       }
380     }
381   }
382   else
383   {
384     try
385     {
386       info.m_input = std::make_shared<IWORKZlibStream>(input);
387     }
388     catch (...)
389     {
390       info.m_input = input;
391     }
392   }
393 
394   if (bool(info.m_input))
395   {
396     assert(!info.m_input->isStructured());
397     info.m_input->seek(0, RVNG_SEEK_SET);
398 
399     bool supported = false;
400     if (info.m_format == FORMAT_BINARY)
401       supported = probeBinary(info);
402     else
403       supported = probeXML(info);
404     if (supported)
405       info.m_confidence = bool(info.m_package) ? EtonyekDocument::CONFIDENCE_EXCELLENT : EtonyekDocument::CONFIDENCE_SUPPORTED_PART;
406   }
407 
408   if (info.m_confidence != EtonyekDocument::CONFIDENCE_NONE)
409   {
410     assert(EtonyekDocument::TYPE_UNKNOWN != info.m_type);
411     assert(FORMAT_UNKNOWN != info.m_format);
412     assert(bool(info.m_input));
413     if (info.m_confidence == EtonyekDocument::CONFIDENCE_EXCELLENT)
414     {
415       assert(bool(info.m_package));
416     }
417   }
418 
419   return info.m_confidence != EtonyekDocument::CONFIDENCE_NONE;
420 }
421 
422 }
423 
isSupported(librevenge::RVNGInputStream * const input,EtonyekDocument::Type * type)424 ETONYEKAPI EtonyekDocument::Confidence EtonyekDocument::isSupported(librevenge::RVNGInputStream *const input, EtonyekDocument::Type *type) try
425 {
426   if (!input)
427     return CONFIDENCE_NONE;
428 
429   if (type)
430     *type = TYPE_UNKNOWN;
431 
432   DetectionInfo info;
433 
434   if (detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
435   {
436     if (type)
437       *type = info.m_type;
438     return info.m_confidence;
439   }
440 
441   return CONFIDENCE_NONE;
442 }
443 catch (...)
444 {
445   return CONFIDENCE_NONE;
446 }
447 
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGPresentationInterface * const generator)448 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGPresentationInterface *const generator) try
449 {
450   if (!input || !generator)
451     return false;
452 
453   DetectionInfo info(EtonyekDocument::TYPE_KEYNOTE);
454 
455   if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
456     return false;
457 
458   info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
459 
460   IWORKPresentationRedirector redirector(generator);
461   KEYCollector collector(&redirector);
462   if (info.m_format == FORMAT_XML1)
463   {
464     KEY1Dictionary dict;
465     input->seek(0, librevenge::RVNG_SEEK_SET);
466     shared_ptr<KEY1Parser> key1Parser(new KEY1Parser(info.m_input, info.m_package, collector, dict));
467     return key1Parser->parse();
468   }
469   else if (info.m_format == FORMAT_XML2)
470   {
471     KEY2Dictionary dict;
472     shared_ptr<KEY2Parser> key2Parser(new KEY2Parser(info.m_input, info.m_package, collector, dict));
473     return key2Parser->parse();
474   }
475   else if (info.m_format == FORMAT_BINARY)
476   {
477     KEY6Parser parser(info.m_fragments, info.m_package, collector);
478     return parser.parse();
479   }
480 
481   ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
482   return false;
483 }
484 catch (...)
485 {
486   return false;
487 }
488 
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGSpreadsheetInterface * const document)489 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGSpreadsheetInterface *const document) try
490 {
491   if (!input || !document)
492     return false;
493 
494   DetectionInfo info(EtonyekDocument::TYPE_NUMBERS);
495 
496   if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
497     return false;
498 
499   info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
500 
501   IWORKSpreadsheetRedirector redirector(document);
502   NUMCollector collector(&redirector);
503   if (info.m_format == FORMAT_XML2)
504   {
505     NUM1Dictionary dict;
506     NUM1Parser parser(info.m_input, info.m_package, collector, &dict);
507     return parser.parse();
508   }
509   else if (info.m_format == FORMAT_BINARY)
510   {
511     NUM3Parser parser(info.m_fragments, info.m_package, collector);
512     return parser.parse();
513   }
514 
515   ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
516   return false;
517 }
518 catch (...)
519 {
520   return false;
521 }
522 
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGTextInterface * const document)523 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGTextInterface *const document) try
524 {
525   if (!input || !document)
526     return false;
527 
528   DetectionInfo info(EtonyekDocument::TYPE_PAGES);
529 
530   if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
531     return false;
532 
533   info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
534 
535   IWORKTextRedirector redirector(document);
536   PAGCollector collector(&redirector);
537   if (info.m_format == FORMAT_XML2)
538   {
539     PAG1Dictionary dict;
540     PAG1Parser parser(info.m_input, info.m_package, collector, &dict);
541     return parser.parse();
542   }
543   else if (info.m_format == FORMAT_BINARY)
544   {
545     PAG5Parser parser(info.m_fragments, info.m_package, collector);
546     return parser.parse();
547   }
548 
549   ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
550   return false;
551 }
552 catch (...)
553 {
554   return false;
555 }
556 
557 }
558 
559 /* vim:set shiftwidth=2 softtabstop=2 expandtab: */
560