1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /*
3 * This file is part of the libetonyek project.
4 *
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 */
9
10 #include <libetonyek/libetonyek.h>
11
12 #include <cassert>
13 #include <cstring>
14 #include <memory>
15
16 #include <boost/algorithm/string/predicate.hpp>
17 #include <boost/optional.hpp>
18
19 #include <libxml/xmlreader.h>
20
21 #include "libetonyek_utils.h"
22 #include "libetonyek_xml.h"
23 #include "IWAMessage.h"
24 #include "IWASnappyStream.h"
25 #include "IWORKPresentationRedirector.h"
26 #include "IWORKSpreadsheetRedirector.h"
27 #include "IWORKSubDirStream.h"
28 #include "IWORKTextRedirector.h"
29 #include "IWORKTokenizer.h"
30 #include "IWORKZlibStream.h"
31 #include "KEY1Dictionary.h"
32 #include "KEY1Parser.h"
33 #include "KEY1Token.h"
34 #include "KEY2Dictionary.h"
35 #include "KEY2Parser.h"
36 #include "KEY2Token.h"
37 #include "KEY6Parser.h"
38 #include "KEYCollector.h"
39 #include "NUMCollector.h"
40 #include "NUM1Dictionary.h"
41 #include "NUM1Parser.h"
42 #include "NUM1Token.h"
43 #include "NUM3Parser.h"
44 #include "PAGCollector.h"
45 #include "PAG1Dictionary.h"
46 #include "PAG1Parser.h"
47 #include "PAG1Token.h"
48 #include "PAG5Parser.h"
49
50
51 using std::shared_ptr;
52 using std::string;
53
54 using librevenge::RVNG_SEEK_SET;
55
56 namespace libetonyek
57 {
58
59 namespace
60 {
61
/// On-disk flavour of an iWork document, as established during detection.
enum Format
{
  FORMAT_UNKNOWN = 0, ///< nothing has been established yet
  FORMAT_XML1 = 1,    ///< XML, probed with the KEY1 tokenizer
  FORMAT_XML2 = 2,    ///< XML, probed with the KEY2/NUM1/PAG1 tokenizers
  FORMAT_BINARY = 3   ///< package containing Index/Document.iwa
};
69
70 struct DetectionInfo
71 {
72 explicit DetectionInfo(EtonyekDocument::Type type = EtonyekDocument::TYPE_UNKNOWN);
73
74 RVNGInputStreamPtr_t m_input;
75 RVNGInputStreamPtr_t m_package;
76 RVNGInputStreamPtr_t m_fragments;
77 EtonyekDocument::Confidence m_confidence;
78 EtonyekDocument::Type m_type;
79 Format m_format;
80 };
81
DetectionInfo(const EtonyekDocument::Type type)82 DetectionInfo::DetectionInfo(const EtonyekDocument::Type type)
83 : m_input()
84 , m_package()
85 , m_fragments()
86 , m_confidence(EtonyekDocument::CONFIDENCE_NONE)
87 , m_type(type)
88 , m_format(FORMAT_UNKNOWN)
89 {
90 }
91
probeXMLFormat(const Format format,const EtonyekDocument::Type type,const int docId,const IWORKTokenizer & tokenizer,const char * const name,const char * const ns,DetectionInfo & info)92 bool probeXMLFormat(const Format format, const EtonyekDocument::Type type, const int docId,
93 const IWORKTokenizer &tokenizer, const char *const name, const char *const ns,
94 DetectionInfo &info)
95 {
96 if (((info.m_format == format) || (info.m_format == FORMAT_UNKNOWN))
97 && ((info.m_type == type) || info.m_type == EtonyekDocument::TYPE_UNKNOWN))
98 {
99 if (tokenizer.getQualifiedId(name, ns) == docId)
100 {
101 info.m_format = format;
102 info.m_type = type;
103 return true;
104 }
105 }
106 return false;
107 }
108
109 namespace
110 {
handleError(void *,const char *,xmlParserSeverities,xmlTextReaderLocatorPtr)111 void handleError(void * /*arg*/, const char * /*msg*/, xmlParserSeverities /*severity*/, xmlTextReaderLocatorPtr /*locator*/)
112 {
113 }
114 }
115
probeXML(DetectionInfo & info)116 bool probeXML(DetectionInfo &info)
117 {
118 const auto reader = xmlReaderForStream(info.m_input);
119 if (!reader)
120 return false;
121
122 xmlTextReaderSetErrorHandler(reader.get(), handleError, nullptr);
123
124 int ret = 0;
125 bool checkAPXL=false;
126 do
127 {
128 ret = xmlTextReaderRead(reader.get());
129 if (ret==1 && XML_READER_TYPE_DOCUMENT_TYPE==xmlTextReaderNodeType(reader.get()))
130 {
131 auto name = xmlTextReaderConstName(reader.get());
132 if (name) checkAPXL=std::string((char const *)name)=="APXL";
133 }
134 }
135 while ((1 == ret) && (XML_READER_TYPE_ELEMENT != xmlTextReaderNodeType(reader.get())));
136
137 if (1 != ret)
138 return false;
139
140 const char *const name = char_cast(xmlTextReaderConstLocalName(reader.get()));
141 const char *const ns = char_cast(xmlTextReaderConstNamespaceUri(reader.get()));
142 if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_KEYNOTE, KEY2Token::NS_URI_KEY | KEY2Token::presentation,
143 KEY2Token::getTokenizer(), name, ns, info))
144 return true;
145 if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_NUMBERS, NUM1Token::NS_URI_LS | NUM1Token::document,
146 NUM1Token::getTokenizer(), name, ns, info))
147 return true;
148 if (probeXMLFormat(FORMAT_XML2, EtonyekDocument::TYPE_PAGES, PAG1Token::NS_URI_SL | PAG1Token::document,
149 PAG1Token::getTokenizer(), name, ns, info))
150 return true;
151 // Keynote 1 files define the document type with <!DOCTYPE APXL>
152 if (probeXMLFormat(FORMAT_XML1, EtonyekDocument::TYPE_KEYNOTE, KEY1Token::NS_URI_KEY | KEY1Token::presentation,
153 KEY1Token::getTokenizer(), name, (ns||!checkAPXL) ? ns : "http://developer.apple.com/schemas/APXL", info))
154 return true;
155 return false;
156 }
157
probeBinary(DetectionInfo & info)158 bool probeBinary(DetectionInfo &info)
159 {
160 const uint64_t headerLen = readUVar(info.m_input);
161 if (headerLen < 8)
162 return false;
163
164 EtonyekDocument::Type detected = EtonyekDocument::TYPE_UNKNOWN;
165
166 const auto pos = uint64_t(info.m_input->tell());
167 const IWAMessage header(info.m_input, (unsigned long) headerLen);
168
169 if (header.uint32(1) && header.message(2) && header.message(2).uint32(1) && (header.uint32(1).get() == 1))
170 {
171 switch (header.message(2).uint32(1).get())
172 {
173 case 1 :
174 if (header.message(2).uint32(3))
175 {
176 uint32_t dataLen = 0;
177 for (auto const &infoT : header.message(2))
178 {
179 if (infoT.uint32(3)) dataLen += infoT.uint32(3).get();
180 }
181 const IWAMessage data(info.m_input, long(pos + headerLen), long(pos + headerLen + dataLen));
182 // keynote: presentation ref in 2
183 // number: sheet ref in 1
184 if (!data.message(1))
185 detected = EtonyekDocument::TYPE_KEYNOTE;
186 else if (!data.message(2))
187 detected = EtonyekDocument::TYPE_NUMBERS;
188 else
189 {
190 unsigned potentialRef[2];
191 for (unsigned test=1; test<=2; ++test)
192 {
193 auto ref=data.message(test).uint32(1).optional();
194 if (!ref)
195 {
196 detected = test==1 ? EtonyekDocument::TYPE_KEYNOTE : EtonyekDocument::TYPE_NUMBERS;
197 break;
198 }
199 potentialRef[test-1]=get(ref);
200 }
201 if (detected != EtonyekDocument::TYPE_UNKNOWN)
202 break;
203 // undecise, try to find the first ref
204 IWAObjectIndex objIndex(info.m_fragments, info.m_package);
205 objIndex.parse();
206 auto type = objIndex.getObjectType(potentialRef[0]);
207 detected = type && get(type)==2 ?
208 EtonyekDocument::TYPE_NUMBERS : EtonyekDocument::TYPE_KEYNOTE;
209 }
210 }
211 break;
212 case 10000 :
213 detected = EtonyekDocument::TYPE_PAGES;
214 break;
215 default:
216 break;
217 }
218 }
219
220 if ((info.m_type == EtonyekDocument::TYPE_UNKNOWN) || (info.m_type == detected))
221 {
222 info.m_type = detected;
223 return true;
224 }
225 return false;
226 }
227
/** Fetch a named substream and take ownership of it.
  *
  * @param[in] input the stream to look in
  * @param[in] name the name of the substream
  * @return a pointer owning whatever getSubStreamByName() returned
  */
RVNGInputStreamPtr_t getSubStream(const RVNGInputStreamPtr_t &input, const char *const name)
{
  RVNGInputStreamPtr_t sub;
  sub.reset(input->getSubStreamByName(name));
  return sub;
}
232
/** Fetch a named substream wrapped in a decompressing stream.
  *
  * @param[in] input the stream to look in
  * @param[in] name the name of the compressed substream
  * @param[in] snappy wrap the substream in IWASnappyStream if true, in
  *   IWORKZlibStream otherwise
  * @return the wrapping stream, or an empty pointer if the substream
  *   could not be obtained or any step threw an exception
  */
RVNGInputStreamPtr_t getUncompressedSubStream(const RVNGInputStreamPtr_t &input, const char *const name, bool snappy = false)
{
  try
  {
    const RVNGInputStreamPtr_t packed(input->getSubStreamByName(name));
    if (!packed)
      return RVNGInputStreamPtr_t();

    if (snappy)
      return std::make_shared<IWASnappyStream>(packed);
    return std::make_shared<IWORKZlibStream>(packed);
  }
  catch (...)
  {
    // any failure is reported as "no stream" below
  }
  return RVNGInputStreamPtr_t();
}
248
detectBinary(RVNGInputStreamPtr_t input,DetectionInfo & info)249 bool detectBinary(RVNGInputStreamPtr_t input, DetectionInfo &info)
250 {
251 assert(input->isStructured());
252
253 if (input->existsSubStream("Metadata/DocumentIdentifier"))
254 info.m_package = input;
255
256 if (input->existsSubStream("Index.zip"))
257 {
258 RVNGInputStreamPtr_t zipInput = getSubStream(input, "Index.zip");
259 if (bool(zipInput))
260 input = zipInput;
261 }
262
263 const bool hasDocument = input->existsSubStream("Index/Document.iwa");
264
265 if (hasDocument)
266 {
267 info.m_format = FORMAT_BINARY;
268 info.m_fragments = input;
269 info.m_input = getUncompressedSubStream(input, "Index/Document.iwa", true);
270 }
271
272 return hasDocument;
273 }
274
/** Wrap the single top-level directory of a structured stream, if there is one.
  *
  * The first path component of the first named substream is taken as the
  * candidate directory; every following name must either equal it or
  * continue with a '/' right after it.
  *
  * @param[in] input a structured stream
  * @return an IWORKSubDirStream for the common directory, or an empty
  *   pointer if the names do not share one
  */
RVNGInputStreamPtr_t queryTopDirStream(const RVNGInputStreamPtr_t &input)
{
  assert(input->isStructured());

  string commonDir;

  for (unsigned idx = 0; idx < input->subStreamCount(); ++idx)
  {
    const char *const entry = input->subStreamName(idx);
    if (!entry)
      continue;

    if (commonDir.empty())
    {
      // no candidate yet: take the part in front of the first slash
      const char *const slash = std::strchr(entry, '/');
      if (slash)
        commonDir.assign(entry, std::size_t(slash - entry));
      else
        commonDir = entry;
      continue;
    }

    // the entry has to live in (or be) the candidate directory
    if (0 != std::strncmp(entry, commonDir.c_str(), commonDir.size()))
      return RVNGInputStreamPtr_t();
    const char next = entry[commonDir.size()];
    if (next != '/' && next != '\0')
      return RVNGInputStreamPtr_t();
  }

  if (commonDir.empty())
    return RVNGInputStreamPtr_t();
  return std::make_shared<IWORKSubDirStream>(input, commonDir);
}
314
detect(const RVNGInputStreamPtr_t & input,DetectionInfo & info)315 bool detect(const RVNGInputStreamPtr_t &input, DetectionInfo &info)
316 {
317 if (input->isStructured())
318 {
319 if ((info.m_format == FORMAT_BINARY) || (info.m_format == FORMAT_UNKNOWN))
320 {
321 if (!detectBinary(input, info))
322 {
323 RVNGInputStreamPtr_t dir = queryTopDirStream(input);
324 if (dir)
325 detectBinary(dir, info);
326 }
327 }
328
329 if ((info.m_format == FORMAT_XML2) || (info.m_format == FORMAT_UNKNOWN))
330 {
331 info.m_package = input;
332
333 if ((info.m_type == EtonyekDocument::TYPE_KEYNOTE) || (info.m_type == EtonyekDocument::TYPE_UNKNOWN))
334 {
335 if (input->existsSubStream("index.apxl"))
336 {
337 info.m_format = FORMAT_XML2;
338 info.m_type = EtonyekDocument::TYPE_KEYNOTE;
339 info.m_input = getSubStream(input, "index.apxl");
340 }
341 else if (input->existsSubStream("index.apxl.gz"))
342 {
343 info.m_format = FORMAT_XML2;
344 info.m_type = EtonyekDocument::TYPE_KEYNOTE;
345 info.m_input = getUncompressedSubStream(input, "index.apxl.gz");
346 }
347 }
348
349 if ((info.m_type == EtonyekDocument::TYPE_NUMBERS) || (info.m_type == EtonyekDocument::TYPE_PAGES) || (info.m_type == EtonyekDocument::TYPE_UNKNOWN))
350 {
351 if (input->existsSubStream("index.xml"))
352 {
353 info.m_format = FORMAT_XML2;
354 info.m_input = getSubStream(input, "index.xml");
355 }
356 else if (input->existsSubStream("index.xml.gz"))
357 {
358 info.m_format = FORMAT_XML2;
359 info.m_input = getUncompressedSubStream(input, "index.xml.gz");
360 }
361 }
362 }
363
364 if (info.m_format == FORMAT_XML1 || info.m_format == FORMAT_UNKNOWN)
365 {
366 info.m_package = input;
367
368 if (input->existsSubStream("presentation.apxl"))
369 {
370 info.m_type = EtonyekDocument::TYPE_KEYNOTE;
371 info.m_format = FORMAT_XML1;
372 info.m_input = getSubStream(input, "presentation.apxl");
373 }
374 else if (input->existsSubStream("presentation.apxl.gz"))
375 {
376 info.m_type = EtonyekDocument::TYPE_KEYNOTE;
377 info.m_format = FORMAT_XML1;
378 info.m_input = getUncompressedSubStream(input, "presentation.apxl.gz");
379 }
380 }
381 }
382 else
383 {
384 try
385 {
386 info.m_input = std::make_shared<IWORKZlibStream>(input);
387 }
388 catch (...)
389 {
390 info.m_input = input;
391 }
392 }
393
394 if (bool(info.m_input))
395 {
396 assert(!info.m_input->isStructured());
397 info.m_input->seek(0, RVNG_SEEK_SET);
398
399 bool supported = false;
400 if (info.m_format == FORMAT_BINARY)
401 supported = probeBinary(info);
402 else
403 supported = probeXML(info);
404 if (supported)
405 info.m_confidence = bool(info.m_package) ? EtonyekDocument::CONFIDENCE_EXCELLENT : EtonyekDocument::CONFIDENCE_SUPPORTED_PART;
406 }
407
408 if (info.m_confidence != EtonyekDocument::CONFIDENCE_NONE)
409 {
410 assert(EtonyekDocument::TYPE_UNKNOWN != info.m_type);
411 assert(FORMAT_UNKNOWN != info.m_format);
412 assert(bool(info.m_input));
413 if (info.m_confidence == EtonyekDocument::CONFIDENCE_EXCELLENT)
414 {
415 assert(bool(info.m_package));
416 }
417 }
418
419 return info.m_confidence != EtonyekDocument::CONFIDENCE_NONE;
420 }
421
422 }
423
isSupported(librevenge::RVNGInputStream * const input,EtonyekDocument::Type * type)424 ETONYEKAPI EtonyekDocument::Confidence EtonyekDocument::isSupported(librevenge::RVNGInputStream *const input, EtonyekDocument::Type *type) try
425 {
426 if (!input)
427 return CONFIDENCE_NONE;
428
429 if (type)
430 *type = TYPE_UNKNOWN;
431
432 DetectionInfo info;
433
434 if (detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
435 {
436 if (type)
437 *type = info.m_type;
438 return info.m_confidence;
439 }
440
441 return CONFIDENCE_NONE;
442 }
443 catch (...)
444 {
445 return CONFIDENCE_NONE;
446 }
447
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGPresentationInterface * const generator)448 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGPresentationInterface *const generator) try
449 {
450 if (!input || !generator)
451 return false;
452
453 DetectionInfo info(EtonyekDocument::TYPE_KEYNOTE);
454
455 if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
456 return false;
457
458 info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
459
460 IWORKPresentationRedirector redirector(generator);
461 KEYCollector collector(&redirector);
462 if (info.m_format == FORMAT_XML1)
463 {
464 KEY1Dictionary dict;
465 input->seek(0, librevenge::RVNG_SEEK_SET);
466 shared_ptr<KEY1Parser> key1Parser(new KEY1Parser(info.m_input, info.m_package, collector, dict));
467 return key1Parser->parse();
468 }
469 else if (info.m_format == FORMAT_XML2)
470 {
471 KEY2Dictionary dict;
472 shared_ptr<KEY2Parser> key2Parser(new KEY2Parser(info.m_input, info.m_package, collector, dict));
473 return key2Parser->parse();
474 }
475 else if (info.m_format == FORMAT_BINARY)
476 {
477 KEY6Parser parser(info.m_fragments, info.m_package, collector);
478 return parser.parse();
479 }
480
481 ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
482 return false;
483 }
484 catch (...)
485 {
486 return false;
487 }
488
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGSpreadsheetInterface * const document)489 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGSpreadsheetInterface *const document) try
490 {
491 if (!input || !document)
492 return false;
493
494 DetectionInfo info(EtonyekDocument::TYPE_NUMBERS);
495
496 if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
497 return false;
498
499 info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
500
501 IWORKSpreadsheetRedirector redirector(document);
502 NUMCollector collector(&redirector);
503 if (info.m_format == FORMAT_XML2)
504 {
505 NUM1Dictionary dict;
506 NUM1Parser parser(info.m_input, info.m_package, collector, &dict);
507 return parser.parse();
508 }
509 else if (info.m_format == FORMAT_BINARY)
510 {
511 NUM3Parser parser(info.m_fragments, info.m_package, collector);
512 return parser.parse();
513 }
514
515 ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
516 return false;
517 }
518 catch (...)
519 {
520 return false;
521 }
522
parse(librevenge::RVNGInputStream * const input,librevenge::RVNGTextInterface * const document)523 ETONYEKAPI bool EtonyekDocument::parse(librevenge::RVNGInputStream *const input, librevenge::RVNGTextInterface *const document) try
524 {
525 if (!input || !document)
526 return false;
527
528 DetectionInfo info(EtonyekDocument::TYPE_PAGES);
529
530 if (!detect(RVNGInputStreamPtr_t(input, EtonyekDummyDeleter()), info))
531 return false;
532
533 info.m_input->seek(0, librevenge::RVNG_SEEK_SET);
534
535 IWORKTextRedirector redirector(document);
536 PAGCollector collector(&redirector);
537 if (info.m_format == FORMAT_XML2)
538 {
539 PAG1Dictionary dict;
540 PAG1Parser parser(info.m_input, info.m_package, collector, &dict);
541 return parser.parse();
542 }
543 else if (info.m_format == FORMAT_BINARY)
544 {
545 PAG5Parser parser(info.m_fragments, info.m_package, collector);
546 return parser.parse();
547 }
548
549 ETONYEK_DEBUG_MSG(("EtonyekDocument::parse: unhandled format %d\n", info.m_format));
550 return false;
551 }
552 catch (...)
553 {
554 return false;
555 }
556
557 }
558
559 /* vim:set shiftwidth=2 softtabstop=2 expandtab: */
560